From 5a48a8275cc199432113ba46754eddcce8b75512 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 20 Jan 2026 12:22:56 +0000 Subject: [PATCH 01/43] basic skeleton for populate_workflow_status.pl script --- populate_workflow_status.pl | 234 ++++++++++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 populate_workflow_status.pl diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl new file mode 100644 index 0000000..3e4a866 --- /dev/null +++ b/populate_workflow_status.pl @@ -0,0 +1,234 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +use DBI; +use JSON::PP; + +use File::Basename; +use File::Spec; + +# add lib sub-folder containing modules into @INC +use lib File::Spec->catdir(File::Basename::dirname(File::Spec->rel2abs($0)), 'lib'); + +# add modules from lib-subfolder +use AuditTable; +use ABCInfo; +use Util; +use Mappings; + +use Data::Dumper; +$Data::Dumper::Sortkeys = 1; + + +=head1 NAME populate_workflow_status.pl + + +=head1 SYNOPSIS + +Used to load FlyBase curation status information into the Alliance ABC literature database for those types of curation that are stored in the ABC in the 'workflow_tag' table, because they are curation types which are relevant to the publication workflow (how a publication 'moves' through the various automated/manual curation processes at a MOD). Generates an output file containing a single json structure for all the data (the workflow_tag status data is a set of arrays within a 'data' object, plus there is a 'metaData' object to indicate source and the API to use to submit the data into the ABC). 
+ +=cut + + +=head1 USAGE + +USAGE: perl populate_workflow_status.pl pg_server db_name pg_username pg_password dev|test|production access_token + + +=cut + + + +=head1 DESCRIPTION + +Types of curation that are relevant to the publication workflow (and thus stored in the Alliance in 'workflow_tag') include those that are: + +o important for deciding the next step in the publication workflow process (manual or automated) + + o e.g. if a paper has had community curation via FTYP, there is no need for a FB curator to do first-pass ('skim') curation. + +o a type of curation that must have been completed before another type of curation can occur + + o e.g. a paper must have been 'thin' curated (Alliance name: 'manual_indexing') before curation of phenotypic or DO data can occur. + +Three types of FB curation are mapped to the appropriate Alliance information by this script: + +o community curation + +o first pass curation by a biocurator ('skim' curation at FB) + +o manual indexing ('thin' curation at FB) + + +(NB: In the Alliance, curation status for individual datatypes (Alliance name: 'topics') are stored in 'curation_status', NOT 'workflow_tag', so FB curation status info for these types of curation (e.g. phenotype, physical interactions) are not dealt with by this script, but instead by populate_topic_curation_status.pl). + +Script has three modes: + +o dev mode + + o single FBrf mode: asks user for FBrf number (can also use a regular expression to test multiple FBrfs in this mode). + + o Data is printed to both the json output and 'plain' output files. + + o makes error files (see below) to record any errors. + + +o test mode + + o single FBrf mode: asks user for FBrf number (must be a single FBrf number). + + o uses curl to try to POST data to the Alliance ABC stage server (so asks user for okta token for Alliance ABC stage server). + + o Data (including a record of successful curl POST events) is printed to the 'plain' output file. 
+ + o makes error files (see below) to record any errors. + + +o production mode + + o makes data for all relevant FBrfs in chado. + + o Data is printed to the json output file. + + o makes error files (see below) to record any errors. + + +o Output files + + + o json output file (FB_workflow_status_data.json) containing a single json structure for all the data (workflow_tag status data is a set of arrays within a 'data' object, plus there is a 'metaData' object to indicate source). Data is printed to this file in all modes except 'test'. + + o 'plain' output file (FB_workflow_status_data.txt) to aid in debugging - prints the same data as in the json file, but with a single 'DATA:' tsv row for each FBrf+topic combination. Data is printed to this file in all modes except 'production'. In 'test' mode, a record of successful curl POST events is also printed to this file. + + o FB_workflow_status_data_errors.err - errors in mapping FlyBase data to appropriate Alliance json are printed in this file. Data is printed to this file in all modes. + + o FB_workflow_status_process_errors.err - processing errors - if a curl POST fails in test mode, the failed json element and the reason for the failure are printed in this file. Expected to be empty for all other modes. + +=cut + + +if (@ARGV != 6) { + warn "Wrong number of arguments, should be 6!\n"; + warn "\n USAGE: $0 pg_server db_name pg_username pg_password dev|test|production access_token\n\n"; + warn "\teg: $0 flysql24 production_chado zhou pwd dev|test|production ABCD1234\n\n"; + exit; +} + +my $server = shift(@ARGV); +my $db = shift(@ARGV); +my $user = shift(@ARGV); +my $pwd = shift(@ARGV); +my $ENV_STATE = shift(@ARGV); +my $access_token = shift(@ARGV); + +unless ($ENV_STATE eq 'dev'|| $ENV_STATE eq 'test'|| $ENV_STATE eq 'production') { + + warn "Unknown state '$ENV_STATE': must be 'dev', 'test' or 'production'\n\n"; + exit; + +} + +my $test_FBrf = ''; + +# variable that specifies the appropriate path (downstream of the base URL) to use for the Alliance Literature Service API. 
+# Used to add an element in the metaData object of the output json file and in the curl command used when in test mode. +my $api_endpoint = 'workflow_tag'; + + + +if ($ENV_STATE eq "test") { + + print STDERR "You are about to write data to the stage Alliance literature server\n"; + print STDERR "Type y to continue else anything else to stop:\n"; + + my $continue = <STDIN>; + chomp $continue; + if (($continue eq 'y') || ($continue eq 'Y')) { + print STDERR "Processing will continue.\n"; + } else { + die "Processing has been cancelled.\n"; + + } + +} + +if ($ENV_STATE eq "dev" || $ENV_STATE eq "test") { + + print STDERR "FBrf to test:"; + $test_FBrf = <STDIN>; + chomp $test_FBrf; + + if ($ENV_STATE eq "test") { + + unless ($test_FBrf =~ m/^FBrf[0-9]{7}$/) { + die "Only a single FBrf is allowed in test mode.\n"; + } + } +} else { + + $test_FBrf = '^FBrf[0-9]+$'; + +} + +my $dsource = sprintf("dbi:Pg:dbname=%s;host=%s;port=5432",$db,$server); +my $dbh = DBI->connect($dsource,$user,$pwd) or die "cannot connect to $dsource\n"; + +my $json_encoder = JSON::PP->new()->pretty(1)->canonical(1); + + + +# add mapping hash here + +my $workflow_tag_mapping = { + + + + +}; + + +# structure of the required json for each workflow_tag element +#{ +# "date_created": $timestamp, +# "date_updated": $timestamp, +# "created_by": $curator, +# "updated_by": $curator, +# "mod_abbreviation": "FB", +# "reference_curie": FB:$FBrf, +# "workflow_tag_id": $ATP, # this is the ATP term that describes the status of the particular type of curation +# "curation_tag": # this is an ATP ID is for the 'controlled_note' - most are negative +# "note": # this is a free text note +#} + + +# open output and error logging files + +#open my $json_output_file, '>', "FB_workflow_status_data.json" +# or die "Can't open json output file ($!)\n"; + +#open my $data_error_file, '>', "FB_workflow_status_data_errors.err" +# or die "Can't open data error logging file ($!)\n"; + +#open my $process_error_file, '>', 
"FB_workflow_status_process_errors.err" +# or die "Can't open processing error logging file ($!)\n"; + + +#open my $plain_output_file, '>', "FB_workflow_status_data.txt" +# or die "Can't open plain output file ($!)\n"; + + +print STDERR "##Starting processing: " . (scalar localtime) . "\n"; + +##add code here## + + + +#close $json_output_file; +#close $data_error_file; +#close $process_error_file; +#close $plain_output_file; + +print STDERR "##Ended processing: " . (scalar localtime) . "\n"; + From 6720f78b85da55b03be83f8f05e27463ad8ba9db Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 20 Jan 2026 14:21:19 +0000 Subject: [PATCH 02/43] tightening up regex for cam_full so will work for both phenotype and manual_indexing status --- lib/AuditTable.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index 4ea5e39..0148520 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -496,7 +496,7 @@ Arguments: 'chemical' => '.+?\.chem\..+?', 'args' => '.+?\.args\..+?', 'phen' => '[a-z][a-z][0-9]{1,}\.phen', - 'cam_full' => '(ma|sb|rd|sf|tj|al|sm|cm|pm|gm|ao|cp|lp|sp|sr|rs|ra|ds|ew|cy)[0-9]{1,}(\.(h|hf))?', + 'cam_full' => '(ma|sb|rd|al|sm|cm|pm|gm|lp|sr|rs|ra|ds|ew|cy)[0-9]{1,}(\.(h|hf))?', # papers that should have had full curation of both any genetic reagent data AND any phenotype data 'humanhealth' => '.+?\.(hh|hh_multiple|hds|hds_multiple|hh_[0-9]{1,}|hds_rvw)\..+?', From 9f077547e27b3a86b0b416ddf61ded88885b2dc6 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 20 Jan 2026 15:13:54 +0000 Subject: [PATCH 03/43] adding get relevant data for working out status --- lib/AuditTable.pm | 11 +++++++++-- populate_workflow_status.pl | 14 +++++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index 0148520..f56d032 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -470,7 +470,7 @@ sub get_relevant_currec_for_datatype { Title: 
get_relevant_currec_for_datatype Usage: get_relevant_currec_for_datatype(database_handle,datatype); - Function: Gets timestamp information for curation records that are expected to contain data of a particular type based on the their filename, for the datatype specified in the second argument. + Function: Gets timestamp information for curation records that are expected to contain data/curation of a particular type based on the their filename, for the data/curation type specified in the second argument. Example: my $currec_for_cell_line = &get_relevant_currec_for_datatype($dbh,'cell_line'); Arguments: @@ -488,6 +488,7 @@ Arguments: # only use standard filenames for matching so that can be sure that curation of the datatype is complete my $datatype_mapping = { + # filename types used for topic curation status 'cell_line' => '.+?\.(cell|cell_multiple|cell_multi)\..+?', 'phys_int' => '.+?\.(int|int_miRNA)\..+?', 'DO' => '.+?\.DO\..+?', @@ -496,8 +497,14 @@ Arguments: 'chemical' => '.+?\.chem\..+?', 'args' => '.+?\.args\..+?', 'phen' => '[a-z][a-z][0-9]{1,}\.phen', - 'cam_full' => '(ma|sb|rd|al|sm|cm|pm|gm|lp|sr|rs|ra|ds|ew|cy)[0-9]{1,}(\.(h|hf))?', # papers that should have had full curation of both any genetic reagent data AND any phenotype data 'humanhealth' => '.+?\.(hh|hh_multiple|hds|hds_multiple|hh_[0-9]{1,}|hds_rvw)\..+?', + # filename types used for workflow status + 'thin' => '[a-z][a-z][0-9]{1,}\.thin', + 'user' => '[a-z][a-z][0-9]{1,}\.user', + 'skim' => '.+?\.(skim|skim_multiple|skim_multi)(\..+)?', + 'gene_full' => '[a-z][a-z][0-9]{1,}\.full', # publications that have had full manual indexing (listing) of genes, and genes are the only entity type relevant to manual indexing in the publication + # filename types used for both topic curation status AND workflow status + 'cam_full' => '(ma|sb|rd|al|sm|cm|pm|gm|lp|sr|rs|ra|ds|ew|cy)[0-9]{1,}(\.(h|hf))?', # publications that should have had full curation of both any genetic reagent data AND any phenotype data 
diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 3e4a866..01b193d 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -221,8 +221,20 @@ =head1 DESCRIPTION print STDERR "##Starting processing: " . (scalar localtime) . "\n"; -##add code here## +## 1. get relevant data for deciding curation status for the various type of workflow_tag curation + +# information for community curation status +my ($user_by_timestamp, $user_by_curator) = &get_relevant_currec_for_datatype($dbh,'user'); + +# information for first pass curation status +my ($skim_by_timestamp, $skim_by_curator) = &get_relevant_currec_for_datatype($dbh,'skim'); + +# information for manual indexing status +my ($thin_by_timestamp, $thin_by_curator) = &get_relevant_currec_for_datatype($dbh,'thin'); +my ($cam_full_by_timestamp, $cam_full_by_curator) = &get_relevant_currec_for_datatype($dbh,'cam_full'); +my ($gene_full_by_timestamp, $gene_full_by_curator) = &get_relevant_currec_for_datatype($dbh,'gene_full'); +my $nocur_flags = &get_matching_pubprop_value_with_timestamps($dbh,'cam_flag','^nocur$'); #close $json_output_file; From f344a8107935b2fcfdd0a5f0c37534a663c87528 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 20 Jan 2026 15:21:14 +0000 Subject: [PATCH 04/43] get publications that have genetic data (will be used for validation) --- lib/Util.pm | 3 +++ populate_workflow_status.pl | 3 +++ 2 files changed, 6 insertions(+) diff --git a/lib/Util.pm b/lib/Util.pm index 19711d1..f2dbc9a 100644 --- a/lib/Util.pm +++ b/lib/Util.pm @@ -195,6 +195,9 @@ o $data_type is the type of curated data. 
The value must be present as a key in ], +# the list of FBid types used in the query is deliberately strict, so that only publications that are about a gene or that use fly lines count as 'has genetic data' to make sure that the manual indexing status and cross-checks with nocur work correctly + 'genetic_data' => ['select distinct p.pub_id from pub p, feature f, feature_pub fp where p.is_obsolete = \'f\' and p.pub_id = fp.pub_id and f.feature_id = fp.feature_id and f.is_obsolete = \'f\' and f.uniquename ~\'^FB(gn|al|ab|ba|ti|te)\'', + ], }; diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 01b193d..b4b8786 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -236,6 +236,9 @@ =head1 DESCRIPTION my ($gene_full_by_timestamp, $gene_full_by_curator) = &get_relevant_currec_for_datatype($dbh,'gene_full'); my $nocur_flags = &get_matching_pubprop_value_with_timestamps($dbh,'cam_flag','^nocur$'); +# get publications that *do* have links to genetic objects (used for validation) +my $has_genetic_data = &pub_has_curated_data($dbh, 'genetic_data'); + #close $json_output_file; #close $data_error_file; From bca59bb982bca93c2958bcc8256365950e47e806 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 20 Jan 2026 15:28:40 +0000 Subject: [PATCH 05/43] get list of publications to get status for --- populate_workflow_status.pl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index b4b8786..e0919cb 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -239,6 +239,17 @@ =head1 DESCRIPTION # get publications that *do* have links to genetic objects (used for validation) my $has_genetic_data = &pub_has_curated_data($dbh, 'genetic_data'); +## 2. 
Get list of publications: restrict to the type of pubs where it is useful to export workflow status info (same types as used in populate_topic_curation_status.pl) +my $pub_id_to_FBrf = {}; +my $sql_query = sprintf("select p.uniquename, p.pub_id, cvt.name from pub p, cvterm cvt where p.is_obsolete = 'f' and p.type_id = cvt.cvterm_id and cvt.is_obsolete = '0' and cvt.name in ('paper', 'erratum', 'letter', 'note', 'teaching note', 'supplementary material', 'retraction', 'personal communication to FlyBase', 'review') and p.uniquename ~'%s'", $test_FBrf); + +my $db_query= $dbh->prepare ($sql_query); +$db_query->execute or die" CAN'T GET FBrf FROM CHADO:\n$sql_query)\n"; +while (my ($uniquename, $pub_id, $pub_type) = $db_query->fetchrow_array()) { + $pub_id_to_FBrf->{$pub_id}->{'FBrf'} = $uniquename; + $pub_id_to_FBrf->{$pub_id}->{'type'} = $pub_type; +} + #close $json_output_file; #close $data_error_file; From b87af378f4fd1183b3ef9e00bcf8615c6271d4fe Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 20 Jan 2026 17:40:50 +0000 Subject: [PATCH 06/43] first attempt at mapping hash --- populate_workflow_status.pl | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index e0919cb..7f1b56e 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -179,12 +179,32 @@ =head1 DESCRIPTION -# add mapping hash here - my $workflow_tag_mapping = { + # community curation + '0_user' => { + 'finished_status' => 'ATP:0000234', # community curation finished + 'relevant_currec' => ['user'], + + }, + + # first pass curation + '1_skim' => { + 'finished_status' => 'ATP:0000330', # first pass curation finished + 'relevant_currec' => ['skim'], + + }, + # manual indexing + '2_manual_indexing' => { + 'finished_status' => 'ATP:0000275', # manual indexing complete + 'relevant_currec' => ['thin', 'cam_full', 'gene_full', 'cam_no_suffix'], # use an array so can go through in 
this order when assigning manual indexing status + 'nocur_override' => 'ATP:0000343', # won't manually index + 'pubtype_filter' => { + 'cam_no_suffix' => ['review'], + }, + }, }; From f7556e76e9e656eba11302676ebc5cc284ae2463 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 20 Jan 2026 17:52:45 +0000 Subject: [PATCH 07/43] curation status data now in single reference, adding missing mapping --- lib/AuditTable.pm | 1 + populate_workflow_status.pl | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index f56d032..a2f7c4d 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -503,6 +503,7 @@ Arguments: 'user' => '[a-z][a-z][0-9]{1,}\.user', 'skim' => '.+?\.(skim|skim_multiple|skim_multi)(\..+)?', 'gene_full' => '[a-z][a-z][0-9]{1,}\.full', # publications that have had full manual indexing (listing) of genes, and genes are the only entity type relevant to manual indexing in the publication + 'cam_no_suffix' => '(ma|sb|rd|sf|tj|kk|al|sm|cm|pm|gm|ao|cp|lp|sp|sr|rs|ra|ds|vt|ew|cy|pu|ga|ha|hb|rc|rf|pg|st|rz)[0-9]{1,}', # filename types used for both topic curation status AND workflow status 'cam_full' => '(ma|sb|rd|al|sm|cm|pm|gm|lp|sr|rs|ra|ds|ew|cy)[0-9]{1,}(\.(h|hf))?', # publications that should have had full curation of both any genetic reagent data AND any phenotype data diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 7f1b56e..799c2da 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -244,22 +244,27 @@ =head1 DESCRIPTION ## 1. 
get relevant data for deciding curation status for the various type of workflow_tag curation -# information for community curation status -my ($user_by_timestamp, $user_by_curator) = &get_relevant_currec_for_datatype($dbh,'user'); +my $fb_data = {}; -# information for first pass curation status -my ($skim_by_timestamp, $skim_by_curator) = &get_relevant_currec_for_datatype($dbh,'skim'); +foreach my $workflow_type (sort keys %{$workflow_tag_mapping}) { -# information for manual indexing status -my ($thin_by_timestamp, $thin_by_curator) = &get_relevant_currec_for_datatype($dbh,'thin'); -my ($cam_full_by_timestamp, $cam_full_by_curator) = &get_relevant_currec_for_datatype($dbh,'cam_full'); -my ($gene_full_by_timestamp, $gene_full_by_curator) = &get_relevant_currec_for_datatype($dbh,'gene_full'); + foreach my $relevant_currec (@{$workflow_tag_mapping->{$workflow_type}->{'relevant_currec'}}) { + + ($fb_data->{"$relevant_currec"}->{"by_timestamp"}, $fb_data->{"$relevant_currec"}->{"by_curator"}) = &get_relevant_currec_for_datatype($dbh,$relevant_currec); + + } +} + + +#print Dumper ($fb_data); + +# 2. get data to assign 'no genetic data' curation tag and associated 'won't curate' workflow_tag term my $nocur_flags = &get_matching_pubprop_value_with_timestamps($dbh,'cam_flag','^nocur$'); # get publications that *do* have links to genetic objects (used for validation) my $has_genetic_data = &pub_has_curated_data($dbh, 'genetic_data'); -## 2. Get list of publications: restrict to the type of pubs where it is useful to export workflow status info (same types as used in populate_topic_curation_status.pl) +## 3. 
Get list of publications: restrict to the type of pubs where it is useful to export workflow status info (same types as used in populate_topic_curation_status.pl) my $pub_id_to_FBrf = {}; my $sql_query = sprintf("select p.uniquename, p.pub_id, cvt.name from pub p, cvterm cvt where p.is_obsolete = 'f' and p.type_id = cvt.cvterm_id and cvt.is_obsolete = '0' and cvt.name in ('paper', 'erratum', 'letter', 'note', 'teaching note', 'supplementary material', 'retraction', 'personal communication to FlyBase', 'review') and p.uniquename ~'%s'", $test_FBrf); From ae9ca72446a74f12fac6bbd59460aec95f4798f5 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Wed, 21 Jan 2026 11:55:55 +0000 Subject: [PATCH 08/43] First outline of script: adds workflow_tag information for different curation record types --- populate_workflow_status.pl | 190 ++++++++++++++++++++++++++++++++++-- 1 file changed, 184 insertions(+), 6 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 799c2da..6062b7a 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -184,24 +184,26 @@ =head1 DESCRIPTION # community curation '0_user' => { 'finished_status' => 'ATP:0000234', # community curation finished - 'relevant_currec' => ['user'], + 'relevant_record_type' => ['user'], }, # first pass curation '1_skim' => { 'finished_status' => 'ATP:0000330', # first pass curation finished - 'relevant_currec' => ['skim'], + 'relevant_record_type' => ['skim'], }, # manual indexing '2_manual_indexing' => { 'finished_status' => 'ATP:0000275', # manual indexing complete - 'relevant_currec' => ['thin', 'cam_full', 'gene_full', 'cam_no_suffix'], # use an array so can go through in this order when assigning manual indexing status + 'relevant_record_type' => ['thin', 'cam_full', 'gene_full', 'cam_no_suffix'], # use an array so can go through the types in this order when assigning manual indexing status 'nocur_override' => 'ATP:0000343', # won't manually index 
'pubtype_filter' => { - 'cam_no_suffix' => ['review'], + 'cam_no_suffix' => { + 'review' => '1', + }, }, }, @@ -248,9 +250,9 @@ =head1 DESCRIPTION foreach my $workflow_type (sort keys %{$workflow_tag_mapping}) { - foreach my $relevant_currec (@{$workflow_tag_mapping->{$workflow_type}->{'relevant_currec'}}) { + foreach my $relevant_record_type (@{$workflow_tag_mapping->{$workflow_type}->{'relevant_record_type'}}) { - ($fb_data->{"$relevant_currec"}->{"by_timestamp"}, $fb_data->{"$relevant_currec"}->{"by_curator"}) = &get_relevant_currec_for_datatype($dbh,$relevant_currec); + ($fb_data->{"$relevant_record_type"}->{"by_timestamp"}, $fb_data->{"$relevant_record_type"}->{"by_curator"}) = &get_relevant_currec_for_datatype($dbh,$relevant_record_type); } } @@ -276,6 +278,104 @@ =head1 DESCRIPTION } +my $complete_data = {}; + +foreach my $pub_id (sort keys %{$pub_id_to_FBrf}) { + + my $FBrf = $pub_id_to_FBrf->{$pub_id}->{'FBrf'}; + my $pub_type = $pub_id_to_FBrf->{$pub_id}->{'type'}; + + # maybe do nocur validation and record status - need to make new subroutine +# my ($nocur_status, $nocur_timestamp, $note) = &check_for_nocur($pub_id); + my ($nocur_status, $nocur_timestamp, $note) = ''; + + + # switch for tracking whether have set the status of each workflow type already + my $switch = { + + '0_user' => 0, + '1_skim' => 0, + '2_manual_indexing' => 0, + + + }; + + foreach my $workflow_type (sort keys %{$workflow_tag_mapping}) { + + + foreach my $relevant_record_type (@{$workflow_tag_mapping->{$workflow_type}->{'relevant_record_type'}}) { + + if (exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'} && exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'}->{$relevant_record_type}) { + + #my $pub_type = $pub_id_to_FBrf->{$pub_id}->{'type'}; + + unless (exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'}->{$relevant_record_type}->{$pub_type}) { + + next; + + } + } + + unless ($switch->{"$workflow_type"}) { + + # make new subroutine to 
get relevant curator and timestamp from $fb_data + my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}->{"by_timestamp"}, $fb_data->{"$relevant_record_type"}->{"by_curator"}, $pub_id); + + # if there is a matching record for the workflow type, make a json structure for submitting data to the Alliance + if (defined $curator_details) { + + + my $ATP = $workflow_tag_mapping->{$workflow_type}->{'finished_status'}; + my $curation_tag = ''; + my $note = ''; + + # for manual indexing, use the nocur information to override the workflow type (to the 'won't curate' style term) and add the appropriate curation_tag + if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'} && $nocur_status == 1) { + $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}; + $curation_tag = "ATP:0000207"; # no genetic data + } + + # build reference with information for this publication+workflow type combination + my $data = {}; + + my $FBrf_with_prefix="FB:".$FBrf; + $data->{date_created} = $curator_details->{timestamp}; + $data->{date_updated} = $curator_details->{timestamp}; + $data->{created_by} = $curator_details->{curator}; + $data->{updated_by} = $curator_details->{curator}; + $data->{mod_abbreviation} = "FB"; + $data->{reference_curie} = $FBrf_with_prefix; + $data->{workflow_tag_id} = $ATP; + $data->{note} = "for debugging: pub type: $pub_type, record type: $relevant_record_type, curation record $curator_details->{currecs}"; + + if ($curation_tag) { + $data->{curation_tag} = $curation_tag; + } + + if ($note) { + $data->{curation_tag} = $note; + } + + push @{$complete_data->{data}}, $data; + + # set the switch to indicate have set the status for this particular workflow type + $switch->{"$workflow_type"}++; + + #print STDERR "$FBrf, $relevant_record_type, $workflow_type, $curator_details->{currecs}: switch value: $switch->{$workflow_type}\n"; + + } + + } + } + + } + + +} + + +print Dumper ($complete_data); + #close 
$json_output_file; #close $data_error_file; #close $process_error_file; @@ -283,3 +383,81 @@ =head1 DESCRIPTION print STDERR "##Ended processing: " . (scalar localtime) . "\n"; + + +############################## + + +sub get_relevant_curator_from_candidate_list { + + unless (@_ == 3) { + + die "Wrong number of parameters passed to the get_relevant_curator_from_candidate_list subroutine\n"; + } + + + my ($data_by_timestamp, $data_by_curator, $pub_id) = @_; + + my $curator_details = undef; + + + if (exists $data_by_timestamp->{$pub_id}) { + + + # get the timestamp of the *earliest* matching curation record + my $timestamp = $data_by_timestamp->{$pub_id}[0]; + + # try to determine the relevant curator if appropriate for the topic + my $relevant_curator = ''; + my $relevant_record_types = ''; + + + if (exists $data_by_curator->{$pub_id}) { + + if (scalar keys %{$data_by_curator->{$pub_id}} == 1) { + my $curator_candidate = join '', keys %{$data_by_curator->{$pub_id}}; + + if (exists $data_by_curator->{$pub_id}->{$curator_candidate}->{$timestamp}) { + $relevant_curator = $curator_candidate; + $relevant_record_types = join ' ', sort keys %{$data_by_curator->{$pub_id}->{$curator_candidate}->{$timestamp}}; + + } + } else { + + my $count = 0; + foreach my $curator_candidate (sort keys %{$data_by_curator->{$pub_id}}) { + + foreach my $candidate_timestamp (sort keys %{$data_by_curator->{$pub_id}->{$curator_candidate}}) { + + if ($candidate_timestamp eq $timestamp) { + + $relevant_curator = $curator_candidate; + $relevant_record_types = join ' ', sort keys %{$data_by_curator->{$pub_id}->{$curator_candidate}->{$candidate_timestamp}}; + $count++; + + } + } + } + + unless ($count == 1) { + $relevant_curator = ''; + $relevant_record_types = ''; + + } + } + } + + if ($relevant_curator) { + + $curator_details->{curator} = $relevant_curator; + $curator_details->{timestamp} = $timestamp; + $curator_details->{currecs} = $relevant_record_types; # useful for debugging + + } + + + } + + 
return $curator_details; +} + From eedf805bf80b4fa02d7b9ac90106346c08c85fae Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Wed, 21 Jan 2026 15:49:10 +0000 Subject: [PATCH 09/43] adding basic nocur logic --- populate_workflow_status.pl | 80 +++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 8 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 6062b7a..82d8742 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -261,7 +261,7 @@ =head1 DESCRIPTION #print Dumper ($fb_data); # 2. get data to assign 'no genetic data' curation tag and associated 'won't curate' workflow_tag term -my $nocur_flags = &get_matching_pubprop_value_with_timestamps($dbh,'cam_flag','^nocur$'); +my $nocur_flags = &get_matching_pubprop_value_with_timestamps($dbh,'cam_flag','^nocur|nocur_abs$'); # get publications that *do* have links to genetic objects (used for validation) my $has_genetic_data = &pub_has_curated_data($dbh, 'genetic_data'); @@ -286,8 +286,8 @@ =head1 DESCRIPTION my $pub_type = $pub_id_to_FBrf->{$pub_id}->{'type'}; # maybe do nocur validation and record status - need to make new subroutine -# my ($nocur_status, $nocur_timestamp, $note) = &check_for_nocur($pub_id); - my ($nocur_status, $nocur_timestamp, $note) = ''; + my ($nocur_status, $nocur_timestamp, $nocur_note) = &check_and_validate_nocur($nocur_flags, $has_genetic_data, $pub_id); +# my ($nocur_status, $nocur_timestamp, $nocur_note) = ''; # switch for tracking whether have set the status of each workflow type already @@ -330,9 +330,20 @@ =head1 DESCRIPTION my $note = ''; # for manual indexing, use the nocur information to override the workflow type (to the 'won't curate' style term) and add the appropriate curation_tag - if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'} && $nocur_status == 1) { - $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}; - $curation_tag = "ATP:0000207"; # no genetic data + if 
(exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) { + $note = "$nocur_note"; + + if ($nocur_status == 1) { + $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}; + $curation_tag = "ATP:0000207"; # no genetic data + unless ($curator_details->{timestamp} eq $nocur_timestamp) { + + $note = $note . " timestamp mis-match: curator: $curator_details->{timestamp}, nocur: $nocur_timestamp"; + + } + + } + } # build reference with information for this publication+workflow type combination @@ -346,16 +357,21 @@ =head1 DESCRIPTION $data->{mod_abbreviation} = "FB"; $data->{reference_curie} = $FBrf_with_prefix; $data->{workflow_tag_id} = $ATP; - $data->{note} = "for debugging: pub type: $pub_type, record type: $relevant_record_type, curation record $curator_details->{currecs}"; if ($curation_tag) { $data->{curation_tag} = $curation_tag; } + # for debugging + $note = $note . " for debugging: $FBrf, pub type: $pub_type, record type: $relevant_record_type, curation record $curator_details->{currecs}"; + $note =~ s/^ //; + if ($note) { - $data->{curation_tag} = $note; + $data->{note} = $note; } + + push @{$complete_data->{data}}, $data; # set the switch to indicate have set the status for this particular workflow type @@ -461,3 +477,51 @@ sub get_relevant_curator_from_candidate_list { return $curator_details; } + +sub check_and_validate_nocur { + + unless (@_ == 3) { + + die "Wrong number of parameters passed to the check_and_validate_nocur subroutine\n"; + } + + my ($nocur_flags, $has_genetic_data, $pub_id) = @_; + + + my $nocur_status = 0; + my $nocur_timestamp = ''; + my $note = ''; + + if (exists $nocur_flags->{$pub_id}) { + + # if there is curated genetic data then ignore any nocur flag as it must be an error + unless (exists $has_genetic_data->{$pub_id}) { + $nocur_status = 1; + } + + if (exists $nocur_flags->{$pub_id}->{nocur}) { + $nocur_timestamp = $nocur_flags->{$pub_id}->{nocur}[0]; + + } else { + + $nocur_timestamp = 
$nocur_flags->{$pub_id}->{nocur_abs}[0]; + + $note = "Only looked at abstract."; + } + + + } else { + + + unless (exists $has_genetic_data->{$pub_id}) { + + $note = "WARNING: publication has no curated genetic data, but is missing a 'nocur' flag"; + + } + + } + + return ($nocur_status, $nocur_timestamp, $note); + + +} From a0d05c083a73ce40105551b0b4e4fa3738217ac3 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Wed, 21 Jan 2026 16:36:23 +0000 Subject: [PATCH 10/43] add doc for get_relevant_curator_from_candidate_list subroutine, cope with edge case (multiple curators for same timestamp) --- populate_workflow_status.pl | 56 ++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 82d8742..7258012 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -185,7 +185,7 @@ =head1 DESCRIPTION '0_user' => { 'finished_status' => 'ATP:0000234', # community curation finished 'relevant_record_type' => ['user'], - + 'community_curation' => '1', }, # first pass curation @@ -329,7 +329,20 @@ =head1 DESCRIPTION my $curation_tag = ''; my $note = ''; - # for manual indexing, use the nocur information to override the workflow type (to the 'won't curate' style term) and add the appropriate curation_tag + # overide the curator for edge cases where the curator has been set to 'FB_curator' + # due to multiple records of the same type being submitted in the same week and it + # is a community curation record. + # set back to 'User Submission' (the more general curator for community curation). 
+ if (exists $workflow_tag_mapping->{$workflow_type}->{'community_curation'}) { + + if ($curator_details->{curator} eq 'FB_curator') { + $curator_details->{curator} = 'User Submission'; + + } + + } + + # for manual indexing, use the nocur information to override the workflow type (to the 'won't curate' style term) and add the appropriate curation_tag where appropriate if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) { $note = "$nocur_note"; @@ -406,6 +419,40 @@ =head1 DESCRIPTION sub get_relevant_curator_from_candidate_list { + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: get_relevant_curator_from_candidate_list + Usage: get_relevant_curator_from_candidate_list(candidate_list(by_timestamp),candidate_list(by_curator),pub_id); + Function: Gets curator details for a pub_id specified in the third argument, using the candidate_lists provided in the first and second arguments. + Example: my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}->{"by_timestamp"}, $fb_data->{"$relevant_record_type"}->{"by_curator"}, $pub_id); + +Arguments: + +candidate_list(by_timestamp) and candidate_list(by_curator) are the two data structures returned by the get_relevant_currec_for_datatype subroutine (in AuditTable.pm). They list timestamp, curator name and curation record information for curation records of a particular type, keyed on pub_ids. + +pub_id is the pub_id of a single reference. + +Returns: + + +- if the pub_id is NOT a key in candidate_list(by_timestamp), returns undef (i.e. 
there is NO curation of this particular curation type for this pub_id) + +- if the pub_id IS a key in candidate_list(by_timestamp), returns information for the *earliest* curation record from the candidate lists + + + o $curator_details->{curator} = curator name + o $curator_details->{timestamp} = timestamp + o $curator_details->{currecs} = curation record filename - useful for debugging + +If there is more than one curator for the *earliest* timestamp (i.e. if multiple curation records of the same type of curation were submitted for the same pub_id in the same data load) then curator is set to 'FB_curator' and currecs is set to 'multiple curators for timestamp' + +=cut + + unless (@_ == 3) { die "Wrong number of parameters passed to the get_relevant_curator_from_candidate_list subroutine\n"; @@ -417,6 +464,7 @@ sub get_relevant_curator_from_candidate_list { my $curator_details = undef; + if (exists $data_by_timestamp->{$pub_id}) { @@ -456,8 +504,8 @@ sub get_relevant_curator_from_candidate_list { } unless ($count == 1) { - $relevant_curator = ''; - $relevant_record_types = ''; + $relevant_curator = 'FB_curator'; + $relevant_record_types = 'multiple curators for timestamp'; } } From 55749120b65d3fdf89f299737011a8504d36ed77 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Wed, 21 Jan 2026 17:05:50 +0000 Subject: [PATCH 11/43] documentation for check_and_validate_nocur --- populate_workflow_status.pl | 39 +++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 7258012..0572185 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -528,6 +528,45 @@ =head1 SUBROUTINE: sub check_and_validate_nocur { +=head1 SUBROUTINE: +=cut + +=head1 + + Title: check_and_validate_nocur + Function: Checks whether a publication has been marked as a 'nocur' (contains no genetic data), and validates whether that flagging is consistent with the types of entity attached to 
the publication in FlyBase. Returns information based on this checking and validation. + Example: my ($nocur_status, $nocur_timestamp, $nocur_note) = &check_and_validate_nocur($nocur_flags, $has_genetic_data, $pub_id); + +Arguments: + +o $nocur_flags - hash reference containing nocur/nocur_abs flag information + +o $has_genetic_data - list of all publications that have genetic entities attached to them. + +o $pub_id is the pub_id of a single reference. + +Returns: + + +o $nocur_status: boolean that indicates whether (1) or not (0) the publication is a 'nocur' (contains no genetic data). + + o Note: if a publication has a nocur/nocur_abs flag in FlyBase but there ARE genetic entities attached to the publication, returns $nocur_status = 0 (i.e. the nocur/nocur_abs flag is ignored as it must be an error). + + +o $nocur_timestamp: timestamp of when the 'nocur' flag was added. + +o $nocur_note: free text note that can be added in the 'note' slot of the relevant ATP workflow_tag item. + + o contains 'Only looked at abstract.' for nocur publications ($nocur_status = 1) with the relevant flag (nocur_abs) in FlyBase. + + o contains 'WARNING: publication has no curated genetic data, but is missing a 'nocur' flag' for publications where there are no genetic entities attached to the publication, but the expected 'nocur'/'nocur_abs' flag is missing. These publications have $nocur_status = 0 as there may have been genetic data entities that were within scope at the time of curation, but are no longer in scope, so cannot assume that 'no genetic data' is appropriate. 
+ + + +=cut + + + unless (@_ == 3) { die "Wrong number of parameters passed to the check_and_validate_nocur subroutine\n"; From 32d4c228bfeb904183327aa153a34dc4de11d9eb Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Wed, 21 Jan 2026 18:41:46 +0000 Subject: [PATCH 12/43] adding additional information that can be added via nocur flag (when manual indexing not already set) --- populate_workflow_status.pl | 65 ++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 0572185..5546aa3 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -328,6 +328,9 @@ =head1 DESCRIPTION my $ATP = $workflow_tag_mapping->{$workflow_type}->{'finished_status'}; my $curation_tag = ''; my $note = ''; + # set values based on matching record (overriden in a few edge cases below) + my $curator = "$curator_details->{curator}"; + my $timestamp = "$curator_details->{timestamp}"; # overide the curator for edge cases where the curator has been set to 'FB_curator' # due to multiple records of the same type being submitted in the same week and it @@ -335,8 +338,8 @@ =head1 DESCRIPTION # set back to 'User Submission' (the more general curator for community curation). if (exists $workflow_tag_mapping->{$workflow_type}->{'community_curation'}) { - if ($curator_details->{curator} eq 'FB_curator') { - $curator_details->{curator} = 'User Submission'; + if ($curator eq 'FB_curator') { + $curator = 'User Submission'; } @@ -349,9 +352,9 @@ =head1 DESCRIPTION if ($nocur_status == 1) { $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}; $curation_tag = "ATP:0000207"; # no genetic data - unless ($curator_details->{timestamp} eq $nocur_timestamp) { - - $note = $note . 
" timestamp mis-match: curator: $curator_details->{timestamp}, nocur: $nocur_timestamp"; + unless ($timestamp eq $nocur_timestamp) { + $timestamp = "$nocur_timestamp"; + $curator = 'FB_curator'; } @@ -363,10 +366,10 @@ =head1 DESCRIPTION my $data = {}; my $FBrf_with_prefix="FB:".$FBrf; - $data->{date_created} = $curator_details->{timestamp}; - $data->{date_updated} = $curator_details->{timestamp}; - $data->{created_by} = $curator_details->{curator}; - $data->{updated_by} = $curator_details->{curator}; + $data->{date_created} = $timestamp; + $data->{date_updated} = $timestamp; + $data->{created_by} = $curator; + $data->{updated_by} = $curator; $data->{mod_abbreviation} = "FB"; $data->{reference_curie} = $FBrf_with_prefix; $data->{workflow_tag_id} = $ATP; @@ -397,6 +400,50 @@ =head1 DESCRIPTION } } + # once been through all the curation record types for each worfklow_type, see if there is additional information that can be added via nocur flag + if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) { + + unless ($switch->{"$workflow_type"}) { + + if ($nocur_status == 1) { + my $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}; + my $curation_tag = "ATP:0000207"; # no genetic data + my $note = "$nocur_note"; + + # build reference with information for this publication+workflow type combination + my $data = {}; + + my $FBrf_with_prefix="FB:".$FBrf; + $data->{date_created} = $nocur_timestamp; + $data->{date_updated} = $nocur_timestamp; + $data->{created_by} = 'FB_curator'; + $data->{updated_by} = 'FB_curator'; + $data->{mod_abbreviation} = "FB"; + $data->{reference_curie} = $FBrf_with_prefix; + $data->{workflow_tag_id} = $ATP; + + if ($curation_tag) { + $data->{curation_tag} = $curation_tag; + } + + # for debugging + $note = $note . 
" for debugging: $FBrf, pub type: $pub_type, nocur added via flag, timestamp: $nocur_timestamp"; + $note =~ s/^ //; + + if ($note) { + $data->{note} = $note; + } + + push @{$complete_data->{data}}, $data; + + # set the switch to indicate have set the status for this particular workflow type - not sure need this + $switch->{"$workflow_type"}++; + } + } + + } + + } From daa3cac5a953f745de6340203c251eb09bc137d0 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Thu, 22 Jan 2026 14:56:26 +0000 Subject: [PATCH 13/43] changes to get_relevant_curator_from_candidate_list: simpler input, proper assignment of curator when multiple possibilites --- populate_workflow_status.pl | 112 +++++++++++++++++++++++------------- 1 file changed, 71 insertions(+), 41 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 5546aa3..ba4200d 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -185,7 +185,6 @@ =head1 DESCRIPTION '0_user' => { 'finished_status' => 'ATP:0000234', # community curation finished 'relevant_record_type' => ['user'], - 'community_curation' => '1', }, # first pass curation @@ -260,6 +259,7 @@ =head1 DESCRIPTION #print Dumper ($fb_data); + # 2. 
get data to assign 'no genetic data' curation tag and associated 'won't curate' workflow_tag term my $nocur_flags = &get_matching_pubprop_value_with_timestamps($dbh,'cam_flag','^nocur|nocur_abs$'); @@ -319,7 +319,7 @@ =head1 DESCRIPTION unless ($switch->{"$workflow_type"}) { # make new subroutine to get relevant curator and timestamp from $fb_data - my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}->{"by_timestamp"}, $fb_data->{"$relevant_record_type"}->{"by_curator"}, $pub_id); + my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}, $pub_id); # if there is a matching record for the workflow type, make a json structure for submitting data to the Alliance if (defined $curator_details) { @@ -332,19 +332,6 @@ =head1 DESCRIPTION my $curator = "$curator_details->{curator}"; my $timestamp = "$curator_details->{timestamp}"; - # overide the curator for edge cases where the curator has been set to 'FB_curator' - # due to multiple records of the same type being submitted in the same week and it - # is a community curation record. - # set back to 'User Submission' (the more general curator for community curation). 
- if (exists $workflow_tag_mapping->{$workflow_type}->{'community_curation'}) { - - if ($curator eq 'FB_curator') { - $curator = 'User Submission'; - - } - - } - # for manual indexing, use the nocur information to override the workflow type (to the 'won't curate' style term) and add the appropriate curation_tag where appropriate if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) { $note = "$nocur_note"; @@ -473,77 +460,110 @@ =head1 SUBROUTINE: =head1 Title: get_relevant_curator_from_candidate_list - Usage: get_relevant_curator_from_candidate_list(candidate_list(by_timestamp),candidate_list(by_curator),pub_id); - Function: Gets curator details for a pub_id specified in the third argument, using the candidate_lists provided in the first and second arguments. - Example: my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}->{"by_timestamp"}, $fb_data->{"$relevant_record_type"}->{"by_curator"}, $pub_id); + Usage: get_relevant_curator_from_candidate_list(candidate_list,pub_id); + Function: Gets relevant curator details for a pub_id specified in the second argument, from the candidate_list provided in the first argument. + Example: my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}, $pub_id); Arguments: -candidate_list(by_timestamp) and candidate_list(by_curator) are the two data structures returned by the get_relevant_currec_for_datatype subroutine (in AuditTable.pm). They list timestamp, curator name and curation record information for curation records of a particular type, keyed on pub_ids. +candidate_list contains timestamp, curator name and curation record information for curation records of a particular type, keyed on pub_ids. 
+ +candidate_list needs to be a data structure with the following general format, which can be generated by using the get_relevant_currec_for_datatype subroutine (this returns two references that can first be put into the candidate_list->{"by_timestamp"} and candidate_list->{"by_curator"} halves of the data structure before it is passed to get_relevant_curator_from_candidate_list. + +o $candidate_list->{"by_timestamp"}->{pub_id} + + o each pub_id refers to an array of the timestamps associated with the pub_id for the curation records, ordered by date. + +o $candidate_list->{"by_curator"}->{pub_id}->{curator}->{timestamp}->{record_number} + + pub_id is the pub_id of a single reference. Returns: -- if the pub_id is NOT a key in candidate_list(by_timestamp), returns undef (i.e. there is NO curation of this particular curation type for this pub_id) +o if the pub_id is NOT a key in $candidate_list->{"by_timestamp"}, returns undef (i.e. there is NO curation of this particular curation type for this pub_id). + +o if the pub_id IS a key in $candidate_list->{"by_timestamp"}, returns information for the *earliest* curation record from the candidate list (getting the relevant details from $candidate_list->{"by_curator"}). + + + o $curator_details->{curator} = curator name + o $curator_details->{timestamp} = timestamp + o $curator_details->{currecs} = curation record filename - useful for debugging -- if the pub_id IS a key in candidate_list(by_timestamp), returns information for the *earliest* curation record from the candidate lists +If there is more than one 'relevant' curator for the *earliest* timestamp (i.e. 
if multiple curation records of the same type of curation were submitted for the same pub_id in the same data load) then: +o curator is set as follows: - o $curator_details->{curator} = curator name - o $curator_details->{timestamp} = timestamp - o $curator_details->{currecs} = curation record filename - useful for debugging + o if any of the relevant curators are FB curators - set to 'FB_curator' + + o otherwise, if any of the relevant curators are community curation - set to 'User Submission' + + o otherwise - set to 'ERROR:unable to reconcile curator' so can easily be identified as an error. + +o currecs is set to 'multiple curators for same timestamp' -If there is more than one curator for the *earliest* timestamp (i.e. if multiple curation records of the same type of curation were submitted for the same pub_id in the same data load) then curator is set to 'FB_curator' and currecs is set to 'multiple curators for timestamp' =cut - unless (@_ == 3) { + unless (@_ == 2) { die "Wrong number of parameters passed to the get_relevant_curator_from_candidate_list subroutine\n"; } - my ($data_by_timestamp, $data_by_curator, $pub_id) = @_; + my ($data, $pub_id) = @_; my $curator_details = undef; - if (exists $data_by_timestamp->{$pub_id}) { + if (exists $data->{"by_timestamp"}->{$pub_id}) { # get the timestamp of the *earliest* matching curation record - my $timestamp = $data_by_timestamp->{$pub_id}[0]; + my $timestamp = $data->{"by_timestamp"}->{$pub_id}[0]; # try to determine the relevant curator if appropriate for the topic my $relevant_curator = ''; - my $relevant_record_types = ''; + my $relevant_records = ''; - if (exists $data_by_curator->{$pub_id}) { + if (exists $data->{"by_curator"}->{$pub_id}) { - if (scalar keys %{$data_by_curator->{$pub_id}} == 1) { - my $curator_candidate = join '', keys %{$data_by_curator->{$pub_id}}; + if (scalar keys %{$data->{"by_curator"}->{$pub_id}} == 1) { + my $curator_candidate = join '', keys %{$data->{"by_curator"}->{$pub_id}}; - 
if (exists $data_by_curator->{$pub_id}->{$curator_candidate}->{$timestamp}) { + if (exists $data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$timestamp}) { $relevant_curator = $curator_candidate; - $relevant_record_types = join ' ', sort keys %{$data_by_curator->{$pub_id}->{$curator_candidate}->{$timestamp}}; + $relevant_records = join ' ', sort keys %{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$timestamp}}; } } else { my $count = 0; - foreach my $curator_candidate (sort keys %{$data_by_curator->{$pub_id}}) { + my $curator_count = 0; + my $community_curation_count = 0; + foreach my $curator_candidate (sort keys %{$data->{"by_curator"}->{$pub_id}}) { - foreach my $candidate_timestamp (sort keys %{$data_by_curator->{$pub_id}->{$curator_candidate}}) { + foreach my $candidate_timestamp (sort keys %{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}}) { if ($candidate_timestamp eq $timestamp) { $relevant_curator = $curator_candidate; - $relevant_record_types = join ' ', sort keys %{$data_by_curator->{$pub_id}->{$curator_candidate}->{$candidate_timestamp}}; + $relevant_records = join ' ', sort keys %{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$candidate_timestamp}}; + + unless ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission' || $relevant_curator eq 'UniProtKB') { + + $curator_count++; + } + + if ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission') { + + $community_curation_count++; + } $count++; } @@ -551,8 +571,18 @@ =head1 SUBROUTINE: } unless ($count == 1) { - $relevant_curator = 'FB_curator'; - $relevant_record_types = 'multiple curators for timestamp'; + if ($curator_count) { + $relevant_curator = 'FB_curator'; + + } elsif ($community_curation_count) { + + $relevant_curator = 'User Submission'; + + } else { + + $relevant_curator = 'ERROR:unable to reconcile curator'; + } + $relevant_records = 'multiple curators for same timestamp'; } } @@ -562,7 +592,7 @@ =head1 
SUBROUTINE: $curator_details->{curator} = $relevant_curator; $curator_details->{timestamp} = $timestamp; - $curator_details->{currecs} = $relevant_record_types; # useful for debugging + $curator_details->{currecs} = $relevant_records; # useful for debugging } From e263e53d3190a2ec41ce2986e6f662d9cc848283 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 23 Jan 2026 11:14:10 +0000 Subject: [PATCH 14/43] faster way to identify matching curator for a pub_id+timestamp combo: subs get_all_currec_data and get_relevant_curator_from_candidate_list_using_pub_and_timestamp --- lib/AuditTable.pm | 72 +++++++++++++++++++++++++- lib/Util.pm | 128 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 198 insertions(+), 2 deletions(-) diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index a2f7c4d..cccf152 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -5,7 +5,7 @@ use warnings; require Exporter; our @ISA = qw(Exporter); -our @EXPORT = qw(get_relevant_curator get_flag_info_with_audit_data get_timestamps_for_flag_with_suffix get_timestamps_for_flaglist_with_suffix get_timestamps_for_pubprop_value get_relevant_currec_for_datatype get_matching_pubprop_value_with_timestamps get_all_flag_info_with_timestamps); +our @EXPORT = qw(get_relevant_curator get_all_currec_data get_flag_info_with_audit_data get_timestamps_for_flag_with_suffix get_timestamps_for_flaglist_with_suffix get_timestamps_for_pubprop_value get_relevant_currec_for_datatype get_matching_pubprop_value_with_timestamps get_all_flag_info_with_timestamps); =head1 MODULE: AuditTable @@ -137,6 +137,76 @@ sub get_relevant_curator { return ($data); } +sub get_all_currec_data { + + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: get_all_currec_data + Usage: get_all_currec_data(database_handle); + Function: Stores curator and corresponding curation_record_filename information for all 'curated_by' pubprops, keyed on pub_id. 
+ Example: my $all_curation_record_data = &get_all_currec_data($dbh); + +Arguments: database handle + +Returns: data structure with the following format: + +$data->{$pub_id}->{$timestamp}->{$curator}->{$curation_record_filename} + + +The get_relevant_curator_from_candidate_list_using_pub_and_timestamp in lib/Util.pm can be used to filter the returned list for the curator (and corresponding curation_record_filename information) that matches a specific pub_id plus timestamp combination. + + +=cut + + + unless (@_ == 1) { + + die "Wrong number of parameters passed to the get_all_currec_data subroutine\n"; + } + + my ($dbh) = @_; + + my $data = {}; + + my $sql_query=sprintf("select distinct pp.pub_id, pp.value, ac.transaction_timestamp from pubprop pp, cvterm c, audit_chado ac where ac.audited_table='pubprop' and ac.audit_transaction='I' and pp.pubprop_id=ac.record_pkey and c.cvterm_id=pp.type_id and c.name = 'curated_by' order by ac.transaction_timestamp"); + + my $db_query = $dbh->prepare($sql_query); + $db_query->execute or die "WARNING: ERROR: Unable to execute get_all_currec_data query ($!)\n"; + + + while (my ($pub_id, $curated_by_value, $timestamp) = $db_query->fetchrow_array) { + + + if ($curated_by_value =~ m/^Curator: (.+?);Proforma: (.+?);timelastmodified: (.*)$/) { + + my $curator = $1; + my $curation_record_filename = $2; + my $timelastmodified = $3; + + # convert all unknown style curators to the same 'FB_curator' name that is used for persistent store submissions + if ($curator eq 'Unknown' || $curator eq 'Unknown Curator' || $curator eq 'Generic Curator') { + + $curator = 'FB_curator'; + + } + + $data->{$pub_id}->{$timestamp}->{$curator}->{$curation_record_filename}++; + + } else { + # not expecting to trip this error + print "ERROR: wrong curated_by pubprop format for pub_id: $pub_id, pubprop: $curated_by_value\n"; + } + } + + return $data; +} + + sub get_flag_info_with_audit_data { diff --git a/lib/Util.pm b/lib/Util.pm index f2dbc9a..c72d971 100644 
--- a/lib/Util.pm +++ b/lib/Util.pm @@ -5,7 +5,7 @@ use warnings; require Exporter; our @ISA = qw(Exporter); -our @EXPORT = qw(make_abc_json_metadata get_yyyymmdd_date_stamp pub_has_curated_data); +our @EXPORT = qw(make_abc_json_metadata get_yyyymmdd_date_stamp pub_has_curated_data get_relevant_curator_from_candidate_list_using_pub_and_timestamp); use JSON::PP; @@ -226,3 +226,129 @@ o $data_type is the type of curated data. The value must be present as a key in } +sub get_relevant_curator_from_candidate_list_using_pub_and_timestamp { + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: get_relevant_curator_from_candidate_list_using_pub_and_timestamp + Usage: get_relevant_curator_from_candidate_list_using_pub_and_timestamp(candidate_list, pub_id of reference, timestamp); + Function: Extracts curator and corresponding curation_record_filename information from the candidate_list specified in the first argument, where it matches the specified pub_id (second argument) plus timestamp (third argument) combination. + Example: my $nocur_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $nocur_timestamp); + +Arguments: + +o candidate_list needs to be a data structure with the following general format (a data structure with the correct format that contains ALL curator related information can be generated by using the get_all_currec_data subroutine that is in lib/AuditTable.pm). + + o $candidate_list->{pub_id}->{timestamp}->{curator}->{curation_record_filename} + + +o pub_id is the pub_id of the single reference to be matched. + +o timestamp is the timestamp to be matched. + +Returns: + + +o if there is no matching curator information for the pub_id+timestamp combination, returns undef. 
+ +o if there is a single matching curator for the pub_id+timestamp combination, returns the following: + + + o $curator_details->{curator} = curator name + o $curator_details->{currecs} = curation record filename(s) - useful for debugging + + +o If there is *more than one* matching curator for the pub_id+timestamp combination, then instead of a returning an individual curator name, a more generic curator name is returned if possible, using the following logic: + + o if any of the relevant curators are FB curators - $curator_details->{curator} is set to 'FB_curator' + + o otherwise, if any of the relevant curators represent community curation - $curator_details->{curator} is set to 'User Submission' (the more general 'curator' for community curation). + + o otherwise - $curator_details->{curator} is set to 'ERROR:unable to reconcile curator' so can easily be identified as a potential error when debugging. + + o In addition, $curator_details->{currecs} is set to 'multiple curators for same timestamp' to help with debugging. 
+ + +=cut + + + unless (@_ == 3) { + + die "Wrong number of parameters passed to the get_relevant_curator_from_candidate_list_using_pub_and_timestamp subroutine\n"; + } + + + my ($data, $pub_id, $timestamp) = @_; + + my $curator_details = undef; + + if (exists $data->{$pub_id} && exists $data->{$pub_id}->{$timestamp}) { + + my $relevant_curator = ''; + my $relevant_records = ''; + + # single matching curator + if (scalar keys %{$data->{$pub_id}->{$timestamp}} == 1) { + $relevant_curator = join '', keys %{$data->{$pub_id}->{$timestamp}}; + $relevant_records = join ' ', sort keys %{$data->{$pub_id}->{$timestamp}->{$relevant_curator}}; + + # multiple matching curators + } else { + + my $fb_curator_count = 0; # count of number of matching FB curators + my $community_curation_count = 0; # count of number of matching types of community curation + foreach my $curator_candidate (sort keys %{$data->{$pub_id}->{$timestamp}}) { + + $relevant_curator = $curator_candidate; + $relevant_records = join ' ', sort keys %{$data->{$pub_id}->{$timestamp}->{$curator_candidate}}; + + unless ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission' || $relevant_curator eq 'UniProtKB') { + + $fb_curator_count++; + } + + if ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission') { + + $community_curation_count++; + } + + } + + if ($fb_curator_count) { + $relevant_curator = 'FB_curator'; + + } elsif ($community_curation_count) { + + $relevant_curator = 'User Submission'; + + } else { + + $relevant_curator = 'ERROR:unable to reconcile curator'; + } + $relevant_records = 'multiple curators for same timestamp'; + + } + + + if ($relevant_curator) { + + if ($relevant_curator eq 'P. 
Leyland') { + $relevant_curator = "FB_curator"; + + } + + + $curator_details->{curator} = $relevant_curator; + $curator_details->{currecs} = $relevant_records; # useful for debugging + + } + + + } + + return $curator_details; +} From a3833b8c1b31dc9d9b3b812f33a443ba9e4e75d2 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 23 Jan 2026 11:19:27 +0000 Subject: [PATCH 15/43] adding nocur details to output --- populate_workflow_status.pl | 51 +++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index ba4200d..127287a 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -242,6 +242,8 @@ =head1 DESCRIPTION print STDERR "##Starting processing: " . (scalar localtime) . "\n"; +my $all_curation_record_data = &get_all_currec_data($dbh); + ## 1. get relevant data for deciding curation status for the various type of workflow_tag curation @@ -331,8 +333,10 @@ =head1 DESCRIPTION # set values based on matching record (overriden in a few edge cases below) my $curator = "$curator_details->{curator}"; my $timestamp = "$curator_details->{timestamp}"; + my $curation_records = "$curator_details->{currecs}"; - # for manual indexing, use the nocur information to override the workflow type (to the 'won't curate' style term) and add the appropriate curation_tag where appropriate + # for manual indexing, use any nocur information to override the workflow type (to the 'won't curate' style term) + # and add the appropriate 'no genetic data' curation_tag where appropriate if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) { $note = "$nocur_note"; @@ -341,12 +345,28 @@ =head1 DESCRIPTION $curation_tag = "ATP:0000207"; # no genetic data unless ($timestamp eq $nocur_timestamp) { $timestamp = "$nocur_timestamp"; - $curator = 'FB_curator'; + $note = $note . 
"timestamp mismatch: curation record info overwritten by nocur flag info"; + + my $nocur_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp); + + if (defined $nocur_details) { + + $curator = "$nocur_details->{curator}"; + $curation_records = "$nocur_details->{currecs}"; + + } else { + $curator = 'FB_curator'; + $curation_records = "WARNING: unable to get curator details for nocur flag"; + + + } + } } + } # build reference with information for this publication+workflow type combination @@ -366,7 +386,7 @@ =head1 DESCRIPTION } # for debugging - $note = $note . " for debugging: $FBrf, pub type: $pub_type, record type: $relevant_record_type, curation record $curator_details->{currecs}"; + $note = $note . " for debugging: $FBrf, pub type: $pub_type, record type: $relevant_record_type, curation record $curation_records"; $note =~ s/^ //; if ($note) { @@ -393,18 +413,33 @@ =head1 DESCRIPTION unless ($switch->{"$workflow_type"}) { if ($nocur_status == 1) { + + my $curator = 'FB_curator'; # default that is overriden with more specific data later where possible + my $timestamp = "$nocur_timestamp"; + my $curation_records = 'WARNING: unable to get curator details for nocur flag'; # default that is overriden with more specific data later where possible my $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}; my $curation_tag = "ATP:0000207"; # no genetic data my $note = "$nocur_note"; + # get curator details for the nocur flag and use to override defaults where possible + my $nocur_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id,$timestamp); + + if (defined $nocur_details) { + + $curator = "$nocur_details->{curator}"; + $curation_records = "$nocur_details->{currecs}"; + + } + + # build reference with information for this publication+workflow type combination my $data = {}; my $FBrf_with_prefix="FB:".$FBrf; - $data->{date_created} 
= $nocur_timestamp; - $data->{date_updated} = $nocur_timestamp; - $data->{created_by} = 'FB_curator'; - $data->{updated_by} = 'FB_curator'; + $data->{date_created} = $timestamp; + $data->{date_updated} = $timestamp; + $data->{created_by} = $curator; + $data->{updated_by} = $curator; $data->{mod_abbreviation} = "FB"; $data->{reference_curie} = $FBrf_with_prefix; $data->{workflow_tag_id} = $ATP; @@ -414,7 +449,7 @@ =head1 DESCRIPTION } # for debugging - $note = $note . " for debugging: $FBrf, pub type: $pub_type, nocur added via flag, timestamp: $nocur_timestamp"; + $note = $note . " for debugging: $FBrf, pub type: $pub_type, nocur added via flag, curation record: $curation_records, timestamp: $nocur_timestamp"; $note =~ s/^ //; if ($note) { From 31192dd67147ea39320c85988736ad643cf6a319 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 23 Jan 2026 11:50:06 +0000 Subject: [PATCH 16/43] start putting debugging output in separate list for printing --- populate_workflow_status.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 127287a..7de8469 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -281,6 +281,7 @@ =head1 DESCRIPTION my $complete_data = {}; +my @debugging_output; foreach my $pub_id (sort keys %{$pub_id_to_FBrf}) { @@ -397,6 +398,9 @@ =head1 DESCRIPTION push @{$complete_data->{data}}, $data; + my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\t$relevant_record_type\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note"; + push @debugging_output, $debugging_output; + # set the switch to indicate have set the status for this particular workflow type $switch->{"$workflow_type"}++; @@ -457,6 +461,8 @@ =head1 DESCRIPTION } push @{$complete_data->{data}}, $data; + my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\tvia flag\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note"; + push @debugging_output, 
$debugging_output; # set the switch to indicate have set the status for this particular workflow type - not sure need this $switch->{"$workflow_type"}++; @@ -471,6 +477,12 @@ =head1 DESCRIPTION } +foreach my $line (sort @debugging_output) { + + print "$line\n"; + + +} print Dumper ($complete_data); From e5e4a313bddd782bafe71d2ae25728e8280437fd Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 23 Jan 2026 14:08:06 +0000 Subject: [PATCH 17/43] simplifying nocur checking and separating out debugging comments --- populate_workflow_status.pl | 46 +++++++++++-------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 7de8469..ed9fa68 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -310,7 +310,6 @@ =head1 DESCRIPTION if (exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'} && exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'}->{$relevant_record_type}) { - #my $pub_type = $pub_id_to_FBrf->{$pub_id}->{'type'}; unless (exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'}->{$relevant_record_type}->{$pub_type}) { @@ -335,18 +334,20 @@ =head1 DESCRIPTION my $curator = "$curator_details->{curator}"; my $timestamp = "$curator_details->{timestamp}"; my $curation_records = "$curator_details->{currecs}"; + my $debugging_note = ''; # for manual indexing, use any nocur information to override the workflow type (to the 'won't curate' style term) # and add the appropriate 'no genetic data' curation_tag where appropriate if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) { - $note = "$nocur_note"; if ($nocur_status == 1) { $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}; $curation_tag = "ATP:0000207"; # no genetic data + $note = "$nocur_note"; + unless ($timestamp eq $nocur_timestamp) { $timestamp = "$nocur_timestamp"; - $note = $note . 
"timestamp mismatch: curation record info overwritten by nocur flag info"; + $debugging_note = "timestamp mismatch: curation record info overwritten by nocur flag info"; my $nocur_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp); @@ -359,7 +360,6 @@ =head1 DESCRIPTION $curator = 'FB_curator'; $curation_records = "WARNING: unable to get curator details for nocur flag"; - } @@ -386,9 +386,6 @@ =head1 DESCRIPTION $data->{curation_tag} = $curation_tag; } - # for debugging - $note = $note . " for debugging: $FBrf, pub type: $pub_type, record type: $relevant_record_type, curation record $curation_records"; - $note =~ s/^ //; if ($note) { $data->{note} = $note; @@ -398,13 +395,12 @@ =head1 DESCRIPTION push @{$complete_data->{data}}, $data; - my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\t$relevant_record_type\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note"; + my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\t$relevant_record_type\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note\t$debugging_note"; push @debugging_output, $debugging_output; # set the switch to indicate have set the status for this particular workflow type $switch->{"$workflow_type"}++; - #print STDERR "$FBrf, $relevant_record_type, $workflow_type, $curator_details->{currecs}: switch value: $switch->{$workflow_type}\n"; } @@ -424,6 +420,7 @@ =head1 DESCRIPTION my $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}; my $curation_tag = "ATP:0000207"; # no genetic data my $note = "$nocur_note"; + my $debugging_note = ''; # get curator details for the nocur flag and use to override defaults where possible my $nocur_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id,$timestamp); @@ -452,16 +449,13 @@ =head1 DESCRIPTION $data->{curation_tag} = $curation_tag; } - # for debugging - $note = $note . 
" for debugging: $FBrf, pub type: $pub_type, nocur added via flag, curation record: $curation_records, timestamp: $nocur_timestamp"; - $note =~ s/^ //; if ($note) { $data->{note} = $note; } push @{$complete_data->{data}}, $data; - my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\tvia flag\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note"; + my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\tvia flag\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note\t$debugging_note"; push @debugging_output, $debugging_output; # set the switch to indicate have set the status for this particular workflow type - not sure need this @@ -667,24 +661,21 @@ =head1 SUBROUTINE: o $has_genetic_data - list of all publications that have genetic entities attached to them. -o $pub_id is the pub_id of a single reference. +o $pub_id is the pub_id of the single publication to be checked. Returns: -o $nocur_status: boolean that indicates whether (1) or not (0) the publication is a 'nocur' (contains no genetic data). - - o Note: if a publication has a nocur/nocur_abs flag in FlyBase but there ARE genetic entities attached to the publication, returns $nocur_status = 0 (i.e. the nocur/nocur_abs flag is ignored as it must be an error). - +$nocur_status is a boolean. It is only set to 1 if the publication being checked IS a nocur (has a nocur/nocur_abs flag in FlyBase) AND passed the validation (there are NO genetic entities attached to it). -o $nocur_timestamp: timestamp of when the 'nocur' flag was added. +(i.e. if a publication has a nocur/nocur_abs flag in FlyBase but there ARE genetic entities attached to the publication, $nocur_status = 0). -o $nocur_note: free text note that can be added in the 'note' slot of the relevant ATP workflow_tag item. - o contains 'Only looked at abstract.' for nocur publications ($nocur_status = 1) with the relevant flag (nocur_abs) in FlyBase. 
+$nocur_timestamp: timestamp of when the nocur/nocur_abs flag was added (if the flag exists). - o contains 'WARNING: publication has no curated genetic data, but is missing a 'nocur' flag' for publications where there are no genetic entities attached to the publication, but the expected 'nocur'/'nocur_abs' flag is missing. These publications have $nocur_status = 0 as there may have been genetic data entities that were within scope at the time of curation, but are no longer in scope, so cannot assume that 'no genetic data' is appropriate. +$nocur_note: free text note that can be added in the 'note' slot of the relevant ATP workflow_tag item. + o contains 'Only looked at abstract.' for publications with a nocur_abs flag in FlyBase. =cut @@ -719,17 +710,6 @@ =head1 SUBROUTINE: $note = "Only looked at abstract."; } - - - } else { - - - unless (exists $has_genetic_data->{$pub_id}) { - - $note = "WARNING: publication has no curated genetic data, but is missing a 'nocur' flag"; - - } - } return ($nocur_status, $nocur_timestamp, $note); From 3f13beb21c0f70452a6b39aed7c22196aa186d02 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 23 Jan 2026 14:22:25 +0000 Subject: [PATCH 18/43] moving check_and_validate_nocur to Util.pm' --- lib/Util.pm | 76 ++++++++++++++++++++++++++++++++++++- populate_workflow_status.pl | 71 ---------------------------------- 2 files changed, 75 insertions(+), 72 deletions(-) diff --git a/lib/Util.pm b/lib/Util.pm index c72d971..506d029 100644 --- a/lib/Util.pm +++ b/lib/Util.pm @@ -5,7 +5,7 @@ use warnings; require Exporter; our @ISA = qw(Exporter); -our @EXPORT = qw(make_abc_json_metadata get_yyyymmdd_date_stamp pub_has_curated_data get_relevant_curator_from_candidate_list_using_pub_and_timestamp); +our @EXPORT = qw(make_abc_json_metadata get_yyyymmdd_date_stamp pub_has_curated_data get_relevant_curator_from_candidate_list_using_pub_and_timestamp check_and_validate_nocur); use JSON::PP; @@ -352,3 +352,77 @@ o If there is *more than one* 
matching curator for the pub_id+timestamp combinat return $curator_details; } + + +sub check_and_validate_nocur { + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: check_and_validate_nocur + Function: Checks whether a publication has been marked as a 'nocur' (contains no genetic data), and validates whether that flagging is consistent with the types of entity attached to the publication in FlyBase. Returns information based on this checking and validation. + Example: my ($nocur_status, $nocur_timestamp, $nocur_note) = &check_and_validate_nocur($nocur_flags, $has_genetic_data, $pub_id); + +Arguments: + +o $nocur_flags - hash reference containing nocur/nocur_abs flag information + +o $has_genetic_data - list of all publications that have genetic entities attached to them. + +o $pub_id is the pub_id of the single publication to be checked. + +Returns: + + +$nocur_status is a boolean. It is only set to 1 if the publication being checked IS a nocur (has a nocur/nocur_abs flag in FlyBase) AND passed the validation (there are NO genetic entities attached to it). + +(i.e. if a publication has a nocur/nocur_abs flag in FlyBase but there ARE genetic entities attached to the publication, $nocur_status = 0). + + +$nocur_timestamp: timestamp of when the nocur/nocur_abs flag was added (if the flag exists). + +$nocur_note: free text note that can be added in the 'note' slot of the relevant ATP workflow_tag item. + + o contains 'Only looked at abstract.' for publications with a nocur_abs flag in FlyBase. 
+ + +=cut + + + + unless (@_ == 3) { + + die "Wrong number of parameters passed to the check_and_validate_nocur subroutine\n"; + } + + my ($nocur_flags, $has_genetic_data, $pub_id) = @_; + + + my $nocur_status = 0; + my $nocur_timestamp = ''; + my $note = ''; + + if (exists $nocur_flags->{$pub_id}) { + + # if there is curated genetic data then ignore any nocur flag as it must be an error + unless (exists $has_genetic_data->{$pub_id}) { + $nocur_status = 1; + } + + if (exists $nocur_flags->{$pub_id}->{nocur}) { + $nocur_timestamp = $nocur_flags->{$pub_id}->{nocur}[0]; + + } else { + + $nocur_timestamp = $nocur_flags->{$pub_id}->{nocur_abs}[0]; + + $note = "Only looked at abstract."; + } + } + + return ($nocur_status, $nocur_timestamp, $note); + + +} diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index ed9fa68..142b083 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -644,75 +644,4 @@ =head1 SUBROUTINE: } -sub check_and_validate_nocur { -=head1 SUBROUTINE: -=cut - -=head1 - - Title: check_and_validate_nocur - Function: Checks whether a publication has been marked as a 'nocur' (contains no genetic data), and validates whether that flagging is consistent with the types of entity attached to the publication in FlyBase. Returns information based on this checking and validation. - Example: my ($nocur_status, $nocur_timestamp, $nocur_note) = &check_and_validate_nocur($nocur_flags, $has_genetic_data, $pub_id); - -Arguments: - -o $nocur_flags - hash reference containing nocur/nocur_abs flag information - -o $has_genetic_data - list of all publications that have genetic entities attached to them. - -o $pub_id is the pub_id of the single publication to be checked. - -Returns: - - -$nocur_status is a boolean. It is only set to 1 if the publication being checked IS a nocur (has a nocur/nocur_abs flag in FlyBase) AND passed the validation (there are NO genetic entities attached to it). - -(i.e. 
if a publication has a nocur/nocur_abs flag in FlyBase but there ARE genetic entities attached to the publication, $nocur_status = 0). - - -$nocur_timestamp: timestamp of when the nocur/nocur_abs flag was added (if the flag exists). - -$nocur_note: free text note that can be added in the 'note' slot of the relevant ATP workflow_tag item. - - o contains 'Only looked at abstract.' for publications with a nocur_abs flag in FlyBase. - - -=cut - - - - unless (@_ == 3) { - - die "Wrong number of parameters passed to the check_and_validate_nocur subroutine\n"; - } - - my ($nocur_flags, $has_genetic_data, $pub_id) = @_; - - - my $nocur_status = 0; - my $nocur_timestamp = ''; - my $note = ''; - - if (exists $nocur_flags->{$pub_id}) { - - # if there is curated genetic data then ignore any nocur flag as it must be an error - unless (exists $has_genetic_data->{$pub_id}) { - $nocur_status = 1; - } - - if (exists $nocur_flags->{$pub_id}->{nocur}) { - $nocur_timestamp = $nocur_flags->{$pub_id}->{nocur}[0]; - - } else { - - $nocur_timestamp = $nocur_flags->{$pub_id}->{nocur_abs}[0]; - - $note = "Only looked at abstract."; - } - } - - return ($nocur_status, $nocur_timestamp, $note); - - -} From 52a5cc093af230a546370fdcc23c34a3f789ecb0 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 23 Jan 2026 14:36:23 +0000 Subject: [PATCH 19/43] moving get_relevant_curator_from_candidate_list to Util.pm and tidying dealing with unknown style curators --- lib/AuditTable.pm | 7 -- lib/Util.pm | 169 +++++++++++++++++++++++++++++++++++- populate_workflow_status.pl | 157 --------------------------------- 3 files changed, 166 insertions(+), 167 deletions(-) mode change 100644 => 100755 lib/Util.pm diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index cccf152..ef155a7 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -188,13 +188,6 @@ The get_relevant_curator_from_candidate_list_using_pub_and_timestamp in lib/Util my $curation_record_filename = $2; my $timelastmodified = $3; - # 
convert all unknown style curators to the same 'FB_curator' name that is used for persistent store submissions - if ($curator eq 'Unknown' || $curator eq 'Unknown Curator' || $curator eq 'Generic Curator') { - - $curator = 'FB_curator'; - - } - $data->{$pub_id}->{$timestamp}->{$curator}->{$curation_record_filename}++; } else { diff --git a/lib/Util.pm b/lib/Util.pm old mode 100644 new mode 100755 index 506d029..ce82b4c --- a/lib/Util.pm +++ b/lib/Util.pm @@ -5,7 +5,7 @@ use warnings; require Exporter; our @ISA = qw(Exporter); -our @EXPORT = qw(make_abc_json_metadata get_yyyymmdd_date_stamp pub_has_curated_data get_relevant_curator_from_candidate_list_using_pub_and_timestamp check_and_validate_nocur); +our @EXPORT = qw(make_abc_json_metadata get_yyyymmdd_date_stamp pub_has_curated_data get_relevant_curator_from_candidate_list get_relevant_curator_from_candidate_list_using_pub_and_timestamp check_and_validate_nocur); use JSON::PP; @@ -225,6 +225,166 @@ o $data_type is the type of curated data. The value must be present as a key in } +sub get_relevant_curator_from_candidate_list { + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: get_relevant_curator_from_candidate_list + Usage: get_relevant_curator_from_candidate_list(candidate_list,pub_id); + Function: Gets relevant curator details for a pub_id specified in the second argument, from the candidate_list provided in the first argument. + Example: my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}, $pub_id); + +Arguments: + +candidate_list contains timestamp, curator name and curation record information for curation records of a particular type, keyed on pub_ids. 
+ +candidate_list needs to be a data structure with the following general format, which can be generated by using the get_relevant_currec_for_datatype subroutine (this returns two references that can first be put into the candidate_list->{"by_timestamp"} and candidate_list->{"by_curator"} halves of the data structure before it is passed to get_relevant_curator_from_candidate_list. + +o $candidate_list->{"by_timestamp"}->{pub_id} + + o each pub_id refers to an array of the timestamps associated with the pub_id for the curation records, ordered by date. + +o $candidate_list->{"by_curator"}->{pub_id}->{curator}->{timestamp}->{record_number} + + + +pub_id is the pub_id of a single reference. + +Returns: + + +o if the pub_id is NOT a key in $candidate_list->{"by_timestamp"}, returns undef (i.e. there is NO curation of this particular curation type for this pub_id). + +o if the pub_id IS a key in $candidate_list->{"by_timestamp"}, returns information for the *earliest* curation record from the candidate list (getting the relevant details from $candidate_list->{"by_curator"}). + + + o $curator_details->{curator} = curator name + o $curator_details->{timestamp} = timestamp + o $curator_details->{currecs} = curation record filename - useful for debugging + +If there is more than one 'relevant' curator for the *earliest* timestamp (i.e. if multiple curation records of the same type of curation were submitted for the same pub_id in the same data load) then: + +o curator is set as follows: + + o if any of the relevant curators are FB curators - set to 'FB_curator' + + o otherwise, if any of the relevant curators are community curation - set to 'User Submission' + + o otherwise - set to 'ERROR:unable to reconcile curator' so can easily be identified as an error. 
+ +o currecs is set to 'multiple curators for same timestamp' + + +=cut + + + unless (@_ == 2) { + + die "Wrong number of parameters passed to the get_relevant_curator_from_candidate_list subroutine\n"; + } + + + my ($data, $pub_id) = @_; + + my $curator_details = undef; + + + + if (exists $data->{"by_timestamp"}->{$pub_id}) { + + + # get the timestamp of the *earliest* matching curation record + my $timestamp = $data->{"by_timestamp"}->{$pub_id}[0]; + + # try to determine the relevant curator if appropriate for the topic + my $relevant_curator = ''; + my $relevant_records = ''; + + + if (exists $data->{"by_curator"}->{$pub_id}) { + + if (scalar keys %{$data->{"by_curator"}->{$pub_id}} == 1) { + my $curator_candidate = join '', keys %{$data->{"by_curator"}->{$pub_id}}; + + if (exists $data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$timestamp}) { + $relevant_curator = $curator_candidate; + $relevant_records = join ' ', sort keys %{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$timestamp}}; + + } + } else { + + my $count = 0; + my $curator_count = 0; + my $community_curation_count = 0; + foreach my $curator_candidate (sort keys %{$data->{"by_curator"}->{$pub_id}}) { + + foreach my $candidate_timestamp (sort keys %{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}}) { + + if ($candidate_timestamp eq $timestamp) { + + $relevant_curator = $curator_candidate; + $relevant_records = join ' ', sort keys %{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$candidate_timestamp}}; + + unless ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission' || $relevant_curator eq 'UniProtKB') { + + $curator_count++; + } + + if ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission') { + + $community_curation_count++; + } + $count++; + + } + } + } + + unless ($count == 1) { + if ($curator_count) { + $relevant_curator = 'FB_curator'; + + } elsif ($community_curation_count) { + + $relevant_curator = 
'User Submission'; + + } else { + + $relevant_curator = 'ERROR:unable to reconcile curator'; + } + $relevant_records = 'multiple curators for same timestamp'; + + } + } + } + + if ($relevant_curator) { + + # convert all unknown style curators to the same 'FB_curator' name that is used for persistent store submissions + if ($relevant_curator eq 'Unknown' || $relevant_curator eq 'Unknown Curator' || $relevant_curator eq 'Generic Curator' || $relevant_curator eq 'P. Leyland') { + + $relevant_curator = 'FB_curator'; + + } + + + $curator_details->{curator} = $relevant_curator; + $curator_details->{timestamp} = $timestamp; + $curator_details->{currecs} = $relevant_records; # useful for debugging + + } + + + } + + return $curator_details; +} + + sub get_relevant_curator_from_candidate_list_using_pub_and_timestamp { @@ -336,8 +496,11 @@ o If there is *more than one* matching curator for the pub_id+timestamp combinat if ($relevant_curator) { - if ($relevant_curator eq 'P. Leyland') { - $relevant_curator = "FB_curator"; + + # convert all unknown style curators to the same 'FB_curator' name that is used for persistent store submissions + if ($relevant_curator eq 'Unknown' || $relevant_curator eq 'Unknown Curator' || $relevant_curator eq 'Generic Curator' || $relevant_curator eq 'P. Leyland') { + + $relevant_curator = 'FB_curator'; } diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 142b083..6937650 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -488,160 +488,3 @@ =head1 DESCRIPTION print STDERR "##Ended processing: " . (scalar localtime) . 
"\n"; - -############################## - - -sub get_relevant_curator_from_candidate_list { - - -=head1 SUBROUTINE: -=cut - -=head1 - - Title: get_relevant_curator_from_candidate_list - Usage: get_relevant_curator_from_candidate_list(candidate_list,pub_id); - Function: Gets relevant curator details for a pub_id specified in the second argument, from the candidate_list provided in the first argument. - Example: my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}, $pub_id); - -Arguments: - -candidate_list contains timestamp, curator name and curation record information for curation records of a particular type, keyed on pub_ids. - -candidate_list needs to be a data structure with the following general format, which can be generated by using the get_relevant_currec_for_datatype subroutine (this returns two references that can first be put into the candidate_list->{"by_timestamp"} and candidate_list->{"by_curator"} halves of the data structure before it is passed to get_relevant_curator_from_candidate_list. - -o $candidate_list->{"by_timestamp"}->{pub_id} - - o each pub_id refers to an array of the timestamps associated with the pub_id for the curation records, ordered by date. - -o $candidate_list->{"by_curator"}->{pub_id}->{curator}->{timestamp}->{record_number} - - - -pub_id is the pub_id of a single reference. - -Returns: - - -o if the pub_id is NOT a key in $candidate_list->{"by_timestamp"}, returns undef (i.e. there is NO curation of this particular curation type for this pub_id). - -o if the pub_id IS a key in $candidate_list->{"by_timestamp"}, returns information for the *earliest* curation record from the candidate list (getting the relevant details from $candidate_list->{"by_curator"}). 
- - - o $curator_details->{curator} = curator name - o $curator_details->{timestamp} = timestamp - o $curator_details->{currecs} = curation record filename - useful for debugging - -If there is more than one 'relevant' curator for the *earliest* timestamp (i.e. if multiple curation records of the same type of curation were submitted for the same pub_id in the same data load) then: - -o curator is set as follows: - - o if any of the relevant curators are FB curators - set to 'FB_curator' - - o otherwise, if any of the relevant curators are community curation - set to 'User Submission' - - o otherwise - set to 'ERROR:unable to reconcile curator' so can easily be identified as an error. - -o currecs is set to 'multiple curators for same timestamp' - - -=cut - - - unless (@_ == 2) { - - die "Wrong number of parameters passed to the get_relevant_curator_from_candidate_list subroutine\n"; - } - - - my ($data, $pub_id) = @_; - - my $curator_details = undef; - - - - if (exists $data->{"by_timestamp"}->{$pub_id}) { - - - # get the timestamp of the *earliest* matching curation record - my $timestamp = $data->{"by_timestamp"}->{$pub_id}[0]; - - # try to determine the relevant curator if appropriate for the topic - my $relevant_curator = ''; - my $relevant_records = ''; - - - if (exists $data->{"by_curator"}->{$pub_id}) { - - if (scalar keys %{$data->{"by_curator"}->{$pub_id}} == 1) { - my $curator_candidate = join '', keys %{$data->{"by_curator"}->{$pub_id}}; - - if (exists $data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$timestamp}) { - $relevant_curator = $curator_candidate; - $relevant_records = join ' ', sort keys %{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$timestamp}}; - - } - } else { - - my $count = 0; - my $curator_count = 0; - my $community_curation_count = 0; - foreach my $curator_candidate (sort keys %{$data->{"by_curator"}->{$pub_id}}) { - - foreach my $candidate_timestamp (sort keys 
%{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}}) { - - if ($candidate_timestamp eq $timestamp) { - - $relevant_curator = $curator_candidate; - $relevant_records = join ' ', sort keys %{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$candidate_timestamp}}; - - unless ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission' || $relevant_curator eq 'UniProtKB') { - - $curator_count++; - } - - if ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission') { - - $community_curation_count++; - } - $count++; - - } - } - } - - unless ($count == 1) { - if ($curator_count) { - $relevant_curator = 'FB_curator'; - - } elsif ($community_curation_count) { - - $relevant_curator = 'User Submission'; - - } else { - - $relevant_curator = 'ERROR:unable to reconcile curator'; - } - $relevant_records = 'multiple curators for same timestamp'; - - } - } - } - - if ($relevant_curator) { - - $curator_details->{curator} = $relevant_curator; - $curator_details->{timestamp} = $timestamp; - $curator_details->{currecs} = $relevant_records; # useful for debugging - - } - - - } - - return $curator_details; -} - - - From dc270419d3af2a3ecef71a638ce09df8870ab095 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 23 Jan 2026 15:24:19 +0000 Subject: [PATCH 20/43] deal with nocur edge case --- populate_workflow_status.pl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 6937650..fbc734c 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -370,6 +370,7 @@ =head1 DESCRIPTION } + # build reference with information for this publication+workflow type combination my $data = {}; @@ -432,6 +433,12 @@ =head1 DESCRIPTION } + # override edge cases where a curator added nocur in a user record + if ($curator eq 'Author Submission' || $curator eq 'User Submission') { + $curator = 'FB_curator'; + + } + # build reference with information for 
this publication+workflow type combination my $data = {}; From 0f0dc98da7fc0ccde47766c1531898e05dd503e4 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 23 Jan 2026 16:18:02 +0000 Subject: [PATCH 21/43] mark high-priority papers as 'need curation' if not already curated if appropriate for workflow_tag type --- populate_workflow_status.pl | 71 +++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index fbc734c..44ed1d1 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -204,6 +204,7 @@ =head1 DESCRIPTION 'review' => '1', }, }, + 'high_priority_override' => 'ATP:0000274', # manual indexing needed }, @@ -262,13 +263,17 @@ =head1 DESCRIPTION #print Dumper ($fb_data); -# 2. get data to assign 'no genetic data' curation tag and associated 'won't curate' workflow_tag term +# get data to assign 'no genetic data' curation tag and associated 'won't curate' workflow_tag term my $nocur_flags = &get_matching_pubprop_value_with_timestamps($dbh,'cam_flag','^nocur|nocur_abs$'); # get publications that *do* have links to genetic objects (used for validation) my $has_genetic_data = &pub_has_curated_data($dbh, 'genetic_data'); -## 3. 
Get list of publications: restrict to the type of pubs where it is useful to export workflow status info (same types as used in populate_topic_curation_status.pl) +## diseaseHP flags to assign 'needs curation' to manual indexing where appropriate +my $diseaseHP_flags = &get_matching_pubprop_value_with_timestamps($dbh,'dis_flag','^diseaseHP$'); + + +## Get list of publications: restrict to the type of pubs where it is useful to export workflow status info (same types as used in populate_topic_curation_status.pl) my $pub_id_to_FBrf = {}; my $sql_query = sprintf("select p.uniquename, p.pub_id, cvt.name from pub p, cvterm cvt where p.is_obsolete = 'f' and p.type_id = cvt.cvterm_id and cvt.is_obsolete = '0' and cvt.name in ('paper', 'erratum', 'letter', 'note', 'teaching note', 'supplementary material', 'retraction', 'personal communication to FlyBase', 'review') and p.uniquename ~'%s'", $test_FBrf); @@ -408,7 +413,7 @@ =head1 DESCRIPTION } } - # once been through all the curation record types for each worfklow_type, see if there is additional information that can be added via nocur flag + # once been through all the curation record types for each workflow_type, see if there is additional information that can be added via nocur flag if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) { unless ($switch->{"$workflow_type"}) { @@ -472,6 +477,66 @@ =head1 DESCRIPTION } + # next, see if there are any non-curated papers that should be marked as 'need curation' as they are high-priority + if (exists $workflow_tag_mapping->{$workflow_type}->{'high_priority_override'}) { + + unless ($switch->{"$workflow_type"}) { + + if (exists $diseaseHP_flags->{$pub_id}) { + + my $timestamp = $diseaseHP_flags->{$pub_id}->{diseaseHP}[0]; + # set generic defaults that are overwritten later with more specific information + my $curator = 'FB_curator'; + my $curation_records = ''; + + + # get curator details for the diseaseHP flag and use to override generic defaults where 
possible + my $diseaseHP_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp); + + if (defined $diseaseHP_details) { + + $curator = "$diseaseHP_details->{curator}"; + $curation_records = "$diseaseHP_details->{currecs}"; + + } + + my $ATP = $workflow_tag_mapping->{$workflow_type}->{'high_priority_override'}; + my $curation_tag = "ATP:0000353"; # high priority data + my $note = ''; + my $debugging_note = ''; + + # build reference with information for this publication+workflow type combination + my $data = {}; + + my $FBrf_with_prefix="FB:".$FBrf; + $data->{date_created} = $timestamp; + $data->{date_updated} = $timestamp; + $data->{created_by} = $curator; + $data->{updated_by} = $curator; + $data->{mod_abbreviation} = "FB"; + $data->{reference_curie} = $FBrf_with_prefix; + $data->{workflow_tag_id} = $ATP; + + if ($curation_tag) { + $data->{curation_tag} = $curation_tag; + } + + + if ($note) { + $data->{note} = $note; + } + + push @{$complete_data->{data}}, $data; + my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\tvia flag\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note\t$debugging_note"; + push @debugging_output, $debugging_output; + + # set the switch to indicate have set the status for this particular workflow type + $switch->{"$workflow_type"}++; + } + + } + + } } From 77e0c1cd29652273865d204c41ab5d47241293cb Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 23 Jan 2026 18:36:47 +0000 Subject: [PATCH 22/43] adding harv_flag diseaseHP flags to high-priority 'needs curation' check --- populate_workflow_status.pl | 96 +++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 42 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 44ed1d1..46483da 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -246,7 +246,7 @@ =head1 DESCRIPTION my $all_curation_record_data = 
&get_all_currec_data($dbh); -## 1. get relevant data for deciding curation status for the various type of workflow_tag curation +## get relevant data for deciding curation status for the various type of workflow_tag curation my $fb_data = {}; @@ -263,14 +263,20 @@ =head1 DESCRIPTION #print Dumper ($fb_data); -# get data to assign 'no genetic data' curation tag and associated 'won't curate' workflow_tag term +## get data to assign 'no genetic data' curation tag and associated 'won't curate' workflow_tag term my $nocur_flags = &get_matching_pubprop_value_with_timestamps($dbh,'cam_flag','^nocur|nocur_abs$'); -# get publications that *do* have links to genetic objects (used for validation) +## get publications that *do* have links to genetic objects (used for validation) my $has_genetic_data = &pub_has_curated_data($dbh, 'genetic_data'); -## diseaseHP flags to assign 'needs curation' to manual indexing where appropriate -my $diseaseHP_flags = &get_matching_pubprop_value_with_timestamps($dbh,'dis_flag','^diseaseHP$'); +## get diseaseHP flags so can assign manual indexing 'needs curation' where appropriate +my $diseaseHP_flags = {}; + +$diseaseHP_flags->{'dis_flag'} = &get_matching_pubprop_value_with_timestamps($dbh,'dis_flag','^diseaseHP$'); + +$diseaseHP_flags->{'harv_flag'} = &get_matching_pubprop_value_with_timestamps($dbh,'harv_flag','^diseaseHP$'); + +#print Dumper ($diseaseHP_flags); ## Get list of publications: restrict to the type of pubs where it is useful to export workflow status info (same types as used in populate_topic_curation_status.pl) @@ -480,58 +486,64 @@ =head1 DESCRIPTION # next, see if there are any non-curated papers that should be marked as 'need curation' as they are high-priority if (exists $workflow_tag_mapping->{$workflow_type}->{'high_priority_override'}) { - unless ($switch->{"$workflow_type"}) { + foreach my $flag_type (sort keys %{$diseaseHP_flags}) { - if (exists $diseaseHP_flags->{$pub_id}) { + unless ($switch->{"$workflow_type"}) { - 
my $timestamp = $diseaseHP_flags->{$pub_id}->{diseaseHP}[0]; - # set generic defaults that are overwritten later with more specific information - my $curator = 'FB_curator'; - my $curation_records = ''; + if (exists $diseaseHP_flags->{$flag_type}->{$pub_id}) { + my $ATP = $workflow_tag_mapping->{$workflow_type}->{'high_priority_override'}; + my $curation_tag = "ATP:0000353"; # high priority data + my $note = ''; + my $debugging_note = ''; - # get curator details for the diseaseHP flag and use to override generic defaults where possible - my $diseaseHP_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp); - if (defined $diseaseHP_details) { + my $timestamp = $diseaseHP_flags->{$flag_type}->{$pub_id}->{diseaseHP}[0]; + # set generic defaults that are overwritten later with more specific information + my $curator = 'FB_curator'; + my $curation_records = ''; - $curator = "$diseaseHP_details->{curator}"; - $curation_records = "$diseaseHP_details->{currecs}"; - } + # get curator details for the diseaseHP flag and use to override generic defaults where possible + my $diseaseHP_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp); - my $ATP = $workflow_tag_mapping->{$workflow_type}->{'high_priority_override'}; - my $curation_tag = "ATP:0000353"; # high priority data - my $note = ''; - my $debugging_note = ''; + if (defined $diseaseHP_details) { - # build reference with information for this publication+workflow type combination - my $data = {}; + $curator = "$diseaseHP_details->{curator}"; + $curation_records = "$diseaseHP_details->{currecs}"; - my $FBrf_with_prefix="FB:".$FBrf; - $data->{date_created} = $timestamp; - $data->{date_updated} = $timestamp; - $data->{created_by} = $curator; - $data->{updated_by} = $curator; - $data->{mod_abbreviation} = "FB"; - $data->{reference_curie} = $FBrf_with_prefix; - $data->{workflow_tag_id} = 
$ATP; + } - if ($curation_tag) { - $data->{curation_tag} = $curation_tag; - } + # build reference with information for this publication+workflow type combination + my $data = {}; - if ($note) { - $data->{note} = $note; - } + my $FBrf_with_prefix="FB:".$FBrf; + $data->{date_created} = $timestamp; + $data->{date_updated} = $timestamp; + $data->{created_by} = $curator; + $data->{updated_by} = $curator; + $data->{mod_abbreviation} = "FB"; + $data->{reference_curie} = $FBrf_with_prefix; + $data->{workflow_tag_id} = $ATP; - push @{$complete_data->{data}}, $data; - my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\tvia flag\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note\t$debugging_note"; - push @debugging_output, $debugging_output; + if ($curation_tag) { + $data->{curation_tag} = $curation_tag; + } - # set the switch to indicate have set the status for this particular workflow type - $switch->{"$workflow_type"}++; + + if ($note) { + $data->{note} = $note; + } + + push @{$complete_data->{data}}, $data; + my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\tvia flag\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note\t$debugging_note"; + push @debugging_output, $debugging_output; + + # set the switch to indicate have set the status for this particular workflow type + $switch->{"$workflow_type"}++; + + } } } From 211cf004a572e0f817c3747a1aca1e97c551f83f Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 23 Jan 2026 19:01:53 +0000 Subject: [PATCH 23/43] first attempt at adding relevant internal note to note slot --- populate_workflow_status.pl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 46483da..c551ae8 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -191,6 +191,7 @@ =head1 DESCRIPTION '1_skim' => { 'finished_status' => 'ATP:0000330', # first pass curation finished 'relevant_record_type' => ['skim'], 
+ 'relevant_internal_note' => 'GeneSkim|generated automatically by script from tagtog output', }, @@ -278,6 +279,20 @@ =head1 DESCRIPTION #print Dumper ($diseaseHP_flags); +## get any relevant internal notes to be added to the note + +my $relevant_internal_notes = {}; + +foreach my $workflow_type (keys %{$workflow_tag_mapping}) { + + if (exists $workflow_tag_mapping->{$workflow_type}->{'relevant_internal_note'}) { + + $relevant_internal_notes->{"$workflow_type"} = &get_matching_pubprop_value_with_timestamps($dbh,'internalnotes',$workflow_tag_mapping->{$workflow_type}->{'relevant_internal_note'}); + + } + +} + ## Get list of publications: restrict to the type of pubs where it is useful to export workflow status info (same types as used in populate_topic_curation_status.pl) my $pub_id_to_FBrf = {}; @@ -347,6 +362,8 @@ =head1 DESCRIPTION my $curation_records = "$curator_details->{currecs}"; my $debugging_note = ''; + + # for manual indexing, use any nocur information to override the workflow type (to the 'won't curate' style term) # and add the appropriate 'no genetic data' curation_tag where appropriate if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) { @@ -381,6 +398,10 @@ =head1 DESCRIPTION } + if (exists $relevant_internal_notes->{$workflow_type}->{$pub_id}) { + $note = $note . 
(join ' ', sort keys %{$relevant_internal_notes->{$workflow_type}->{$pub_id}}); + } + # build reference with information for this publication+workflow type combination my $data = {}; From af1bb96b4e509c9a28e8ac183c9019e02120cdd3 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Mon, 26 Jan 2026 16:16:16 +0000 Subject: [PATCH 24/43] change structure of reference hash to make it easier to add relevant internal notes to the note slot --- populate_workflow_status.pl | 144 +++++++++++++++++++----------------- 1 file changed, 76 insertions(+), 68 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index c551ae8..8ff185d 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -306,8 +306,8 @@ =head1 DESCRIPTION } -my $complete_data = {}; -my @debugging_output; +my $workflow_status_data = {}; + foreach my $pub_id (sort keys %{$pub_id_to_FBrf}) { @@ -319,16 +319,6 @@ =head1 DESCRIPTION # my ($nocur_status, $nocur_timestamp, $nocur_note) = ''; - # switch for tracking whether have set the status of each workflow type already - my $switch = { - - '0_user' => 0, - '1_skim' => 0, - '2_manual_indexing' => 0, - - - }; - foreach my $workflow_type (sort keys %{$workflow_tag_mapping}) { @@ -344,7 +334,7 @@ =head1 DESCRIPTION } } - unless ($switch->{"$workflow_type"}) { + unless (exists $workflow_status_data->{$pub_id}->{$workflow_type}) { # make new subroutine to get relevant curator and timestamp from $fb_data my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}, $pub_id); @@ -404,36 +394,29 @@ =head1 DESCRIPTION # build reference with information for this publication+workflow type combination - my $data = {}; my $FBrf_with_prefix="FB:".$FBrf; - $data->{date_created} = $timestamp; - $data->{date_updated} = $timestamp; - $data->{created_by} = $curator; - $data->{updated_by} = $curator; - $data->{mod_abbreviation} = "FB"; - $data->{reference_curie} = $FBrf_with_prefix; - 
$data->{workflow_tag_id} = $ATP; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_updated} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{updated_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{mod_abbreviation} = "FB"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{reference_curie} = $FBrf_with_prefix; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{workflow_tag_id} = $ATP; if ($curation_tag) { - $data->{curation_tag} = $curation_tag; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{curation_tag} = $curation_tag; } if ($note) { - $data->{note} = $note; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = $note; } - - push @{$complete_data->{data}}, $data; - - my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\t$relevant_record_type\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note\t$debugging_note"; - push @debugging_output, $debugging_output; - - # set the switch to indicate have set the status for this particular workflow type - $switch->{"$workflow_type"}++; - + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records; + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} = $relevant_record_type; } @@ -443,7 +426,7 @@ =head1 DESCRIPTION # once been through all the curation record types for each workflow_type, see if there is additional information that can be added via nocur flag if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) { - unless ($switch->{"$workflow_type"}) { + unless (exists $workflow_status_data->{$pub_id}->{$workflow_type}) 
{ if ($nocur_status == 1) { @@ -473,32 +456,32 @@ =head1 DESCRIPTION # build reference with information for this publication+workflow type combination - my $data = {}; my $FBrf_with_prefix="FB:".$FBrf; - $data->{date_created} = $timestamp; - $data->{date_updated} = $timestamp; - $data->{created_by} = $curator; - $data->{updated_by} = $curator; - $data->{mod_abbreviation} = "FB"; - $data->{reference_curie} = $FBrf_with_prefix; - $data->{workflow_tag_id} = $ATP; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_updated} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{updated_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{mod_abbreviation} = "FB"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{reference_curie} = $FBrf_with_prefix; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{workflow_tag_id} = $ATP; + + # if ($curation_tag) { - $data->{curation_tag} = $curation_tag; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{curation_tag} = $curation_tag; } if ($note) { - $data->{note} = $note; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = $note; } - push @{$complete_data->{data}}, $data; - my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\tvia flag\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note\t$debugging_note"; - push @debugging_output, $debugging_output; - # set the switch to indicate have set the status for this particular workflow type - not sure need this - $switch->{"$workflow_type"}++; + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records; + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; + 
$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} = 'via flag'; + } } @@ -509,7 +492,7 @@ =head1 DESCRIPTION foreach my $flag_type (sort keys %{$diseaseHP_flags}) { - unless ($switch->{"$workflow_type"}) { + unless (exists $workflow_status_data->{$pub_id}->{$workflow_type}) { if (exists $diseaseHP_flags->{$flag_type}->{$pub_id}) { @@ -537,32 +520,27 @@ =head1 DESCRIPTION # build reference with information for this publication+workflow type combination - my $data = {}; my $FBrf_with_prefix="FB:".$FBrf; - $data->{date_created} = $timestamp; - $data->{date_updated} = $timestamp; - $data->{created_by} = $curator; - $data->{updated_by} = $curator; - $data->{mod_abbreviation} = "FB"; - $data->{reference_curie} = $FBrf_with_prefix; - $data->{workflow_tag_id} = $ATP; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_updated} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{updated_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{mod_abbreviation} = "FB"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{reference_curie} = $FBrf_with_prefix; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{workflow_tag_id} = $ATP; if ($curation_tag) { - $data->{curation_tag} = $curation_tag; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{curation_tag} = $curation_tag; } if ($note) { - $data->{note} = $note; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = $note; } - - push @{$complete_data->{data}}, $data; - my $debugging_output = "DATA:$pub_id\t$FBrf\t$pub_type\tvia flag\t$curator\t$curation_records\t$ATP\t$curation_tag\t$timestamp\t$note\t$debugging_note"; - push @debugging_output, $debugging_output; - - # set the switch to indicate have 
set the status for this particular workflow type - $switch->{"$workflow_type"}++; + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records; + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} = 'from diseaseHP'; } } @@ -576,13 +554,43 @@ =head1 DESCRIPTION } -foreach my $line (sort @debugging_output) { +my $complete_data = {}; - print "$line\n"; +foreach my $pub_id (sort keys %{$workflow_status_data}) { + foreach my $workflow_type (sort keys %{$workflow_status_data->{$pub_id}}) { + + my $FBrf = $pub_id_to_FBrf->{$pub_id}->{'FBrf'}; + my $pub_type = $pub_id_to_FBrf->{$pub_id}->{'type'}; + + + + #store data for making json later + push @{$complete_data->{data}}, $workflow_status_data->{$pub_id}->{$workflow_type}->{json}; + + # simple output for testing/debugging + my $curated_by = $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'created_by'} ? "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'created_by'}" : ''; + my $updated_by = $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'updated_by'} ? "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'updated_by'}" : ''; + + + my $workflow_tag_id = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'workflow_tag_id'} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'workflow_tag_id'} : ''; + my $date_created = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'date_created'} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'date_created'} : ''; + + + my $curation_tag = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'curation_tag'} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'curation_tag'} : ''; + my $note = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'note'} ? 
$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'note'} : ''; + + my $curation_records = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} : ''; + my $debugging_note = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} : ''; + my $relevant_record_type = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} : ''; + + + print "DATA:$pub_id\t$FBrf\t$pub_type\t$relevant_record_type\t$curated_by\t$curation_records\t$workflow_tag_id\t$curation_tag\t$date_created\t$note\t$debugging_note\n"; + } } + print Dumper ($complete_data); #close $json_output_file; From 5c0eeadb80ca505825d4f0a1e3a5d1c69f212876 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Mon, 26 Jan 2026 18:40:17 +0000 Subject: [PATCH 25/43] first commit of get_all_pub_internal_notes_for_tet_wf --- lib/AuditTable.pm | 121 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 1 deletion(-) diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index ef155a7..14dbdfc 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -5,7 +5,7 @@ use warnings; require Exporter; our @ISA = qw(Exporter); -our @EXPORT = qw(get_relevant_curator get_all_currec_data get_flag_info_with_audit_data get_timestamps_for_flag_with_suffix get_timestamps_for_flaglist_with_suffix get_timestamps_for_pubprop_value get_relevant_currec_for_datatype get_matching_pubprop_value_with_timestamps get_all_flag_info_with_timestamps); +our @EXPORT = qw(get_relevant_curator get_all_currec_data get_flag_info_with_audit_data get_timestamps_for_flag_with_suffix get_timestamps_for_flaglist_with_suffix get_timestamps_for_pubprop_value get_relevant_currec_for_datatype 
get_matching_pubprop_value_with_timestamps get_all_flag_info_with_timestamps get_all_pub_internal_notes_for_tet_wf); =head1 MODULE: AuditTable @@ -790,3 +790,122 @@ o $audit_timestamp is a timestamp from the audit_chado table. The $audit_timesta return $data; } + + + + + +sub get_all_pub_internal_notes_for_tet_wf { + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: get_all_pub_internal_notes_for_tet_wf + Usage: get_all_pub_internal_notes_for_tet_wf(database_handle); + Function: Gets all publication level internal notes that we want to try to submit to the Alliance ABC in either the topic-entity-tg (tet) editor or the workflow editor. + Example: my $all_pub_internal_notes_for_tet_wf = &get_all_pub_internal_notes_for_tet_wf($dbh); + + +Returns: + +The returned hash reference has the following structure: + +@{$data->{$pub_id}->{$line}}, $audit_timestamp; + + +o $pub_id is the pub_id of the reference. + +o $line is the complete internal note; if the internal note contains multiple lines they will be present in the returned $line, so this may need to be converted to spaces when submitting to the ABC if you want the note to appear as a single line. + +o $audit_timestamp is the 'I' timestamp(s) from the audit_chado table for the matching $line. The $audit_timestamp values are sorted earliest to latest within the array. + + +The subroutine only returns internal notes that we want to try to submit to the Alliance ABC (e.g. by matching the internal note timestamp to the timestamp of the appropriate topic (for the tet topics or wf curation_status) or curation record (for wf workflow_tag). + +The subroutine thus removes internal notes of two main kinds: + + +o those in $ignore - internal notes related to the bibliography that are no longer relevant + +o those in $email - email addresses (either community curation or the submitter of a personal communication) - these will be submitted elsewhere in the ABC. 
+ + + +=cut + + + unless (@_ == 1) { + + die "Wrong number of parameters passed to the get_all_pub_internal_notes_for_tet_wf subroutine\n"; + } + + + my ($dbh, $pubprop_type, $string) = @_; + + my $data = {}; + + + my $ignore = [ + + 'Automated trawl for PMID, ggrumbli 090330', + 'BIOSIS Gene Name field: .+', + + ]; + + + my $email = [ + + '(Author|User|Submitter|submitter): ?.*?\?$', + '(Author|User|Submitter|submitter): ?.*?\? [a-z]{2}[0-9]{1,}\.$', + '(Author|User|Submitter|submitter): ?.*?\? ', + '(Author|User|Submitter|submitter): ?.*?\?$', + '(Author|User|Submitter|submitter): ?.*?\?$', + '(Author|User|Submitter|submitter) .*?\?\. ?[a-z]{2}[0-9]{1,}\.$', + '^(Author|User|Submitter|submitter):$', + + + + ]; + + + my $sql_query = sprintf("select distinct pp.pub_id, pp.value, ac.transaction_timestamp from pubprop pp, cvterm c, audit_chado ac where c.cvterm_id=pp.type_id and c.name ='internalnotes' and ac.audited_table='pubprop' and ac.audit_transaction = 'I' and pp.pubprop_id=ac.record_pkey order by ac.transaction_timestamp"); + my $db_query = $dbh->prepare($sql_query); + $db_query->execute or die "WARNING: ERROR: Unable to execute get_timestamps_for_pubprop_value query ($!)\n"; + + + while (my ($pub_id, $value, $audit_timestamp) = $db_query->fetchrow_array) { + + + foreach my $regex (@{$ignore}) { + + $value =~ s/$regex//mg; + + + } + + foreach my $regex (@{$email}) { + + $value =~ s/$regex//mg; + + + } + + $value =~ s/^ +//; + $value =~ s/ +$//; + $value =~ s/^\n+//; + + + if ($value ne '') { + + push @{$data->{$pub_id}->{$value}}, $audit_timestamp; + + } + } + + return $data; + +} + From efd4c2f7166b9b6976320c3199cae3dd910ddda0 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 27 Jan 2026 10:59:49 +0000 Subject: [PATCH 26/43] more changes to get_all_pub_internal_notes_for_tet_wf --- lib/AuditTable.pm | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index 
14dbdfc..8d97596 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -805,9 +805,14 @@ sub get_all_pub_internal_notes_for_tet_wf { Title: get_all_pub_internal_notes_for_tet_wf Usage: get_all_pub_internal_notes_for_tet_wf(database_handle); - Function: Gets all publication level internal notes that we want to try to submit to the Alliance ABC in either the topic-entity-tg (tet) editor or the workflow editor. - Example: my $all_pub_internal_notes_for_tet_wf = &get_all_pub_internal_notes_for_tet_wf($dbh); + Function: Gets all publication level internal notes that we want to try to submit to the Alliance ABC in either the topic-entity-tg (tet) editor or the workflow editor, additionally filtered using the array reference provided in the second argument. + Example: my $all_pub_internal_notes_for_tet_wf = &get_all_pub_internal_notes_for_tet_wf($dbh, $additional_filters); +Arguments: + +o database handle + +o reference to an array of additional regular expressions to be used to filter out matching internal notes from the returned output. (This array reference can be empty). Returns: @@ -823,27 +828,26 @@ o $line is the complete internal note; if the internal note contains multiple li o $audit_timestamp is the 'I' timestamp(s) from the audit_chado table for the matching $line. The $audit_timestamp values are sorted earliest to latest within the array. -The subroutine only returns internal notes that we want to try to submit to the Alliance ABC (e.g. by matching the internal note timestamp to the timestamp of the appropriate topic (for the tet topics or wf curation_status) or curation record (for wf workflow_tag). - -The subroutine thus removes internal notes of two main kinds: +The subroutine only returns internal notes that we want to end up in either the topic-entity-tg (tet) editor or the workflow editor in the Alliance ABC. 
Internal notes of two kinds are thus removed from returned output by default, using the regular expressions in the following array references: -o those in $ignore - internal notes related to the bibliography that are no longer relevant +o $ignore - internal notes that are related to the bibliographic data and are no longer relevant. -o those in $email - email addresses (either community curation or the submitter of a personal communication) - these will be submitted elsewhere in the ABC. +o $email - email address information (either community curation or the submitter of a personal communication) - these will be submitted elsewhere in the ABC. +In addition, any regular expressions passed via the array reference in the second argument are also used to remove matching internal notes from the returned output. -=cut +=cut - unless (@_ == 1) { + unless (@_ == 2) { die "Wrong number of parameters passed to the get_all_pub_internal_notes_for_tet_wf subroutine\n"; } - my ($dbh, $pubprop_type, $string) = @_; + my ($dbh, $additional_filters) = @_; my $data = {}; @@ -852,7 +856,7 @@ o those in $email - email addresses (either community curation or the submitter 'Automated trawl for PMID, ggrumbli 090330', 'BIOSIS Gene Name field: .+', - + '^[?:]?Supplement:.+$', # there are some cases with ? 
or : at the beginning of the Supplement: line ]; @@ -893,6 +897,14 @@ o those in $email - email addresses (either community curation or the submitter } + foreach my $regex (@{$additional_filters}) { + + $value =~ s/$regex//mg; + + + } + + $value =~ s/^ +//; $value =~ s/ +$//; $value =~ s/^\n+//; From 760ac50a94f5efb6666e5a8e9d96e24293a94414 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 27 Jan 2026 11:37:22 +0000 Subject: [PATCH 27/43] more doc for get_all_pub_internal_notes_for_tet_wf --- lib/AuditTable.pm | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index 8d97596..f1eeac2 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -805,14 +805,14 @@ sub get_all_pub_internal_notes_for_tet_wf { Title: get_all_pub_internal_notes_for_tet_wf Usage: get_all_pub_internal_notes_for_tet_wf(database_handle); - Function: Gets all publication level internal notes that we want to try to submit to the Alliance ABC in either the topic-entity-tg (tet) editor or the workflow editor, additionally filtered using the array reference provided in the second argument. + Function: Gets all publication level internal notes that we want to try to submit to the Alliance ABC in either the topic-entity-tg (tet) editor or the workflow editor, additionally filtered using the regexes in the array reference provided in the second argument. Example: my $all_pub_internal_notes_for_tet_wf = &get_all_pub_internal_notes_for_tet_wf($dbh, $additional_filters); Arguments: o database handle -o reference to an array of additional regular expressions to be used to filter out matching internal notes from the returned output. (This array reference can be empty). +o $additional_filters - reference to an array of additional regular expressions to be used to filter the internal notes before they are returned. (This array reference can be empty). 
Returns: @@ -836,7 +836,12 @@ o $ignore - internal notes that are related to the bibliographic data and are no o $email - email address information (either community curation or the submitter of a personal communication) - these will be submitted elsewhere in the ABC. -In addition, any regular expressions passed via the array reference in the second argument are also used to remove matching internal notes from the returned output. +In addition, any regular expressions passed via the $additional_filters array reference are also used to filter the returned output. Note that the regexes are used to remove matching text from each internal note; any remaining text after the substitutions is returned. This strategy is necessary as in some cases, a single internal note contains multiple lines that may need different treatment when being exported to the ABC. For example, some internal notes in FB are not being submitted as free text 'notes' to the ABC, but are instead being turned into an ATP term representing a topic, topic status or a controlled tag; in these cases, the regexes in $additional_filters can be used to remove the relevant internal notes so that they are not then redundantly also submitted to the ABC as a free text note. + +This means that any regex in $additional_filters should typically be specified to either remove a *complete* line (using ^ and $ in the regex), or be sufficiently specific that it is safe to remove the text when it is part of a line (e.g. similar to the email regexes in the $email array reference below). + +(It isn't possible to just split on newline to separate out each 'row' of a given 'multiple lines' internal note, because in some cases, the multiple lines correspond to a paragraph (i.e. the multiple lines should stay together when submitted to the ABC) but in others, each row represents a different kind of internal note that may need to be split up and treated differently as they are submitted to the ABC (e.g. 
those that are being converted into an ATP term as described above). + =cut From b29743b505b9931206faf2ceabb5da3abc09bc59 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Wed, 28 Jan 2026 11:30:03 +0000 Subject: [PATCH 28/43] first attempt at adding internal notes to appropriate workflow_tag element --- lib/AuditTable.pm | 2 + populate_workflow_status.pl | 160 +++++++++++++++++++++++++++++++++--- 2 files changed, 151 insertions(+), 11 deletions(-) diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index f1eeac2..9d49619 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -913,7 +913,9 @@ This means that any regex in $additional_filters should typically be specified t $value =~ s/^ +//; $value =~ s/ +$//; $value =~ s/^\n+//; + $value =~ s/\n+$//; + $value =~ s/\t//g; if ($value ne '') { diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 8ff185d..d92d39a 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -21,6 +21,8 @@ use Data::Dumper; $Data::Dumper::Sortkeys = 1; +use Encode; +binmode(STDOUT, ":utf8"); =head1 NAME populate_workflow_status.pl @@ -191,7 +193,6 @@ =head1 DESCRIPTION '1_skim' => { 'finished_status' => 'ATP:0000330', # first pass curation finished 'relevant_record_type' => ['skim'], - 'relevant_internal_note' => 'GeneSkim|generated automatically by script from tagtog output', }, @@ -279,17 +280,63 @@ =head1 DESCRIPTION #print Dumper ($diseaseHP_flags); -## get any relevant internal notes to be added to the note +my $additional_filters = [ -my $relevant_internal_notes = {}; + # Use simple regex to remove *all* 'Dataset:' lines - will be converted to a topic and/or associated free text note in another script + # The commented out lines are the regexes needed to identify the 'Dataset: pheno' lines for the topic scripts + #'^Dataset: pheno\.?$', + #'^Dataset: pheno\. ?-?[a-z]{1,} ?[0-9]{6}(\.)?$', + #'^Dataset: pheno\. 
[0-9]{6}[a-z]{1,}\.$', + '^(D|d)ataset:.+$', -foreach my $workflow_type (keys %{$workflow_tag_mapping}) { + # filters to remove lines that will be converted to a topic *status* and/or associated free text note in another script + '^HDM flag not applicable\.?( *[a-z]{1,}[0-9]{6})?$', + '^phen curation: only pheno_chem data in paper\.( *[a-z]{2}[0-9]{6}\.?)?$', + '^phen curation: No phenotypic data in paper\.( *[a-z]{2}[0-9]{6}\.?)?$', + '^phen_cur: CV annotations only(\. *[a-z]{2}[0-9]{6}\.?)?$', + '^phys_int not curated.+$', - if (exists $workflow_tag_mapping->{$workflow_type}->{'relevant_internal_note'}) { + # filters to remove lines that will be converted to a free text note attached to a topic in another script + '^The phys_int flag inferred from.+$', + '^The phys_int flag is inferred from.+$', + '^The phys_int flag was inferred.+$', - $relevant_internal_notes->{"$workflow_type"} = &get_matching_pubprop_value_with_timestamps($dbh,'internalnotes',$workflow_tag_mapping->{$workflow_type}->{'relevant_internal_note'}); +]; +## get any relevant internal notes to be added to the free text note slot of a workflow_tag element - first filtering out any internal notes that do not need to be added by this script (they will instead either be converted into an ATP term corresponding to a topic or controlled note in the Alliance, or added as a free text note to a specific topic), using the regexes specified in $additional_filters. +my $all_candidate_internal_notes = &get_all_pub_internal_notes_for_tet_wf($dbh, $additional_filters); + +## then remove any internal notes that come from curation records for a particular topic - these will be added as a note to the curation status of the *topic* in another script, rather than being added to the more general workflow_tag categories in this script. 
+# any remaining internal notes will be those either submitted under one of the workflow_tag types for this script (identified later through matching timestamp and curation record filename information) or under an 'edit' record - the latter will be added to the most appropriate workflow_tag with a note indicating it was an edit record +my @topic_record_types = ('cell_line', 'phys_int', 'DO', 'neur_exp', 'wt_exp', 'chemical', 'args', 'phen', 'humanhealth'); + +foreach my $topic_record_type (@topic_record_types) { + + my (undef, $curation_record_data) = &get_relevant_currec_for_datatype($dbh,$topic_record_type); + + + foreach my $pub_id (keys %{$all_candidate_internal_notes}) { + + foreach my $int_note (keys %{$all_candidate_internal_notes->{$pub_id}}) { + + foreach my $timestamp (@{$all_candidate_internal_notes->{$pub_id}->{$int_note}}) { + my $int_note_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp); + + if (defined $int_note_details && $int_note_details->{currecs} ne 'multiple curators for same timestamp') { + + if (exists $curation_record_data->{$pub_id}->{"$int_note_details->{curator}"} && exists $curation_record_data->{$pub_id}->{"$int_note_details->{curator}"}->{$timestamp}->{"$int_note_details->{currecs}"}) { + + delete $all_candidate_internal_notes->{$pub_id}->{$int_note}; + next; + } + } + + + } } +} + + } @@ -388,10 +435,6 @@ =head1 DESCRIPTION } - if (exists $relevant_internal_notes->{$workflow_type}->{$pub_id}) { - $note = $note . 
(join ' ', sort keys %{$relevant_internal_notes->{$workflow_type}->{$pub_id}}); - } - # build reference with information for this publication+workflow type combination @@ -554,6 +597,100 @@ =head1 DESCRIPTION } +# try to add relevant publication-level internal notes where appropriate + +foreach my $pub_id (sort keys %{$all_candidate_internal_notes}) { + + if (exists $workflow_status_data->{$pub_id}) { + + foreach my $int_note (sort keys %{$all_candidate_internal_notes->{$pub_id}}) { + + my $switch = 0; + my $reformatted_note = "$int_note"; + $reformatted_note =~ s/\n/ /g; + + # for internal notes with a single timestamp + if (scalar @{$all_candidate_internal_notes->{$pub_id}->{$int_note}} == 1) { + + + my $int_note_timestamp = join '', @{$all_candidate_internal_notes->{$pub_id}->{$int_note}}; + + + # 1. go through the different workflow_tags to find cases where both the timestamp and curation records show a match + foreach my $workflow_type (sort keys %{$workflow_status_data->{$pub_id}}) { + + unless ($switch) { + my $workflow_type_timestamp = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created}"; + my $workflow_type_currecs = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs}"; + my $workflow_type_curator = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by}"; + + #print "HERE2: workflow: $workflow_type_currecs, $workflow_type_timestamp\n"; + + # 2. if the timestamps of the workflow tag and internal note match + if ($int_note_timestamp eq $workflow_type_timestamp) { + + my $int_note_curator_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $int_note_timestamp); + + + # 3. 
get the curation record details for the internal note to check against those of workflow_tag + if (defined $int_note_curator_details) { + + my $int_note_currecs = "$int_note_curator_details->{currecs}"; + my $int_note_curator = "$int_note_curator_details->{curator}"; + + + #print "HERE: internal note: $int_note_currecs, $int_note_timestamp\n"; + + # 4. if the curation record for the workflow type is in the list of possibilities for the internal note + # and there is only one curator possibility for that timestamp + # then the internal note can be added to the entry for that workflow tag + if ($int_note_currecs =~ m/$workflow_type_currecs/ && $int_note_currecs ne 'multiple curators for same timestamp') { + + $switch++; + + + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) { + + + my $note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = "$note||$int_note"; + + + } else { + + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = "$int_note"; + + } + } + + } + + } + + } + } + + + } else { + + print "WARNING: internal note(s) with MULTIPLE TIMESTAMPS: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n"; + + + } + + unless ($switch) { + + print "WARNING: internal note(s) that could not match up: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n"; + + + } + } + } +} + + +#print Dumper ($workflow_status_data); + my $complete_data = {}; @@ -580,6 +717,7 @@ =head1 DESCRIPTION my $curation_tag = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'curation_tag'} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'curation_tag'} : ''; my $note = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'note'} ? 
$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'note'} : ''; + $note =~ s/\n/ /g; my $curation_records = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} : ''; my $debugging_note = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} : ''; @@ -590,9 +728,9 @@ =head1 DESCRIPTION } } - print Dumper ($complete_data); + #close $json_output_file; #close $data_error_file; #close $process_error_file; From 931ad21540d3f6c2bd54c949bd4511161ad0045f Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 30 Jan 2026 10:09:49 +0000 Subject: [PATCH 29/43] tightening up regex for authors --- lib/AuditTable.pm | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index 9d49619..556e14d 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -869,7 +869,6 @@ This means that any regex in $additional_filters should typically be specified t '(Author|User|Submitter|submitter): ?.*?\?$', '(Author|User|Submitter|submitter): ?.*?\? [a-z]{2}[0-9]{1,}\.$', - '(Author|User|Submitter|submitter): ?.*?\? ', '(Author|User|Submitter|submitter): ?.*?\?$', '(Author|User|Submitter|submitter): ?.*?\?$', '(Author|User|Submitter|submitter) .*?\?\. 
?[a-z]{2}[0-9]{1,}\.$', From 5076882c0f5d0e8ee2f378b698be034305c61064 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 30 Jan 2026 13:31:21 +0000 Subject: [PATCH 30/43] another email regex tweak --- lib/AuditTable.pm | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index 556e14d..5d889ff 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -872,6 +872,10 @@ This means that any regex in $additional_filters should typically be specified t '(Author|User|Submitter|submitter): ?.*?\?$', '(Author|User|Submitter|submitter): ?.*?\?$', '(Author|User|Submitter|submitter) .*?\?\. ?[a-z]{2}[0-9]{1,}\.$', + + # remove email info from beginning of line - require <> around email part of regex else is not strict enough + '(Author|User|Submitter|submitter): .*?\<\S+?@\S+?\>\. ', + '^(Author|User|Submitter|submitter):$', From 435efc2a2960a16f1afd7a8e2dd6bea56bd449de Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 30 Jan 2026 14:26:10 +0000 Subject: [PATCH 31/43] adding filter for cell_line note --- populate_workflow_status.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index d92d39a..40a06b0 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -300,6 +300,7 @@ =head1 DESCRIPTION '^The phys_int flag inferred from.+$', '^The phys_int flag is inferred from.+$', '^The phys_int flag was inferred.+$', + '^FTYP cell line:.+$', ]; From 8e85b9379d8f2522bd0638af6bb8d073952455b4 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Fri, 30 Jan 2026 15:26:54 +0000 Subject: [PATCH 32/43] adding clean_note to clean note text formatting --- lib/Util.pm | 132 +++++++++++++++++++++++++++++++++++- populate_workflow_status.pl | 6 +- 2 files changed, 134 insertions(+), 4 deletions(-) diff --git a/lib/Util.pm b/lib/Util.pm index ce82b4c..25e558a 100755 --- a/lib/Util.pm +++ b/lib/Util.pm @@ -5,7 +5,7 @@ use warnings; require Exporter; our @ISA = 
qw(Exporter); -our @EXPORT = qw(make_abc_json_metadata get_yyyymmdd_date_stamp pub_has_curated_data get_relevant_curator_from_candidate_list get_relevant_curator_from_candidate_list_using_pub_and_timestamp check_and_validate_nocur); +our @EXPORT = qw(make_abc_json_metadata get_yyyymmdd_date_stamp pub_has_curated_data get_relevant_curator_from_candidate_list get_relevant_curator_from_candidate_list_using_pub_and_timestamp check_and_validate_nocur clean_note); use JSON::PP; @@ -589,3 +589,133 @@ $nocur_note: free text note that can be added in the 'note' slot of the relevant } + + + + +sub clean_note { + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: clean_note + Function: Subroutine that 'cleans' text that is being submitted to the Alliance ABC as an internal 'note'. + Example: &clean_note($value); + +The conversions done are: + +- removes any tabs + +- converts superscripts/subscripts to curator-friendly form + + +- uses degreek subroutine to convert any greek symbols in FB 'sgml' format to spelt out greek name. + + +Does NOT substitute returns (because many of the internal notes being submitted are in multiline format which needs to be preserved). + + +=cut + unless (@_ == 1) { + + die "Wrong number of parameters passed to the clean_free_text subroutine\n"; + } + my ($string) = @_; + + if ($string =~ m/\t/) { + $string =~ s/\t/ /g; + } + + $string =~ s/\<\/down\>/\]\]/g; + $string =~ s/\<down\>/\[\[/g; + $string =~ s/\<up\>/\[/g; + $string =~ s/\<\/up\>/\]/g; + + $string = &degreek($string); + + return $string; + +} + + + +sub degreek { + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: degreek + Function: Subroutine that converts sgml greek format to spelt out greek name. 
+ + +=cut + + + my $symbol = $_[0]; + + my %greek = ( + '&Agr;' => 'Alpha', + '&Bgr;' => 'Beta', + '&Ggr;' => 'Gamma', + '&Dgr;' => 'Delta', + '&Egr;' => 'Epsilon', + '&Zgr;' => 'Zeta', + '&EEgr;' => 'Eta', + '&THgr;' => 'Theta', + '&Igr;' => 'Iota', + '&Kgr;' => 'Kappa', + '&Lgr;' => 'Lambda', + '&Mgr;' => 'Mu', + '&Ngr;' => 'Nu', + '&Xgr;' => 'Xi', + '&Ogr;' => 'Omicron', + '&Pgr;' => 'Pi', + '&Rgr;' => 'Rho', + '&Sgr;' => 'Sigma', + '&Tgr;' => 'Tau', + '&Ugr;' => 'Upsilon', + '&PHgr;' => 'Phi', + '&KHgr;' => 'Chi', + '&PSgr;' => 'Psi', + '&OHgr;' => 'Omega', + '&agr;' => 'alpha', + '&bgr;' => 'beta', + '&ggr;' => 'gamma', + '&dgr;' => 'delta', + '&egr;' => 'epsilon', + '&zgr;' => 'zeta', + '&eegr;' => 'eta', + '&thgr;' => 'theta', + '&igr;' => 'iota', + '&kgr;' => 'kappa', + '&lgr;' => 'lambda', + '&mgr;' => 'mu', + '&ngr;' => 'nu', + '&xgr;' => 'xi', + '&ogr;' => 'omicron', + '&pgr;' => 'pi', + '&rgr;' => 'rho', + '&sgr;' => 'sigma', + '&tgr;' => 'tau', + '&ugr;' => 'upsilon', + '&phgr;' => 'phi', + '&khgr;' => 'chi', + '&psgr;' => 'psi', + '&ohgr;' => 'omega', + ); + + + $symbol =~ s/(&[a-z]{1,2}gr;)/$greek{$1}?"$greek{$1}":"$1"/egi; + + $symbol =~ s/&cap;/INTERSECTION/g; # conversion needed to make 'plain' symbol for any FBco symbols in note. 
+ + return $symbol; + +} + diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 40a06b0..96a783e 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -607,7 +607,7 @@ =head1 DESCRIPTION foreach my $int_note (sort keys %{$all_candidate_internal_notes->{$pub_id}}) { my $switch = 0; - my $reformatted_note = "$int_note"; + my $reformatted_note = &clean_note("$int_note"); $reformatted_note =~ s/\n/ /g; # for internal notes with a single timestamp @@ -654,12 +654,12 @@ =head1 DESCRIPTION my $note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}"; - $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = "$note||$int_note"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$note||$int_note"); } else { - $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = "$int_note"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$int_note"); } } From 1d99ef6e734c7b0c645e2439eeb7d66b633f0bf7 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Sun, 1 Feb 2026 16:57:36 +0000 Subject: [PATCH 33/43] another filter --- populate_workflow_status.pl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 96a783e..de7326e 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -295,6 +295,7 @@ =head1 DESCRIPTION '^phen curation: No phenotypic data in paper\.( *[a-z]{2}[0-9]{6}\.?)?$', '^phen_cur: CV annotations only(\. 
*[a-z]{2}[0-9]{6}\.?)?$', '^phys_int not curated.+$', + '^FTA: DOcur genotype - need to check for missing drivers$', # filters to remove lines that will be converted to a free text note attached to a topic in another script '^The phys_int flag inferred from.+$', @@ -302,6 +303,7 @@ =head1 DESCRIPTION '^The phys_int flag was inferred.+$', '^FTYP cell line:.+$', + ]; ## get any relevant internal notes to be added to the free text note slot of a workflow_tag element - first filtering out any internal notes that do not need to be added by this script (they will instead either be converted into an ATP term corresponding to a topic or controlled note in the Alliance, or added as a free text note to a specific topic), using the regexes specified in $additional_filters. @@ -616,7 +618,7 @@ =head1 DESCRIPTION my $int_note_timestamp = join '', @{$all_candidate_internal_notes->{$pub_id}->{$int_note}}; - + # A. first pass - go through different workflow types, from least to most detailed curation, and match up internal notes using timestamp and curation record info # 1. go through the different workflow_tags to find cases where both the timestamp and curation records show a match foreach my $workflow_type (sort keys %{$workflow_status_data->{$pub_id}}) { From d3efa130b5b4761090be88b54821760abd2d9654 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Sun, 1 Feb 2026 17:19:53 +0000 Subject: [PATCH 34/43] another filter --- populate_workflow_status.pl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index de7326e..9ef8140 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -303,6 +303,9 @@ =head1 DESCRIPTION '^The phys_int flag was inferred.+$', '^FTYP cell line:.+$', + # filter to remove lines that would be better converted into a note when submit curation record filename info. + '^Curation record .*? 
is to add the allele phendesc data originally curated in .+$', + ]; From 5d5abf7a8321055cfd0464006612db25d7ccbb32 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Mon, 2 Feb 2026 11:38:59 +0000 Subject: [PATCH 35/43] adding loop to try to add internal notes that do not match by timestamp --- populate_workflow_status.pl | 74 +++++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 9ef8140..0782684 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -464,9 +464,12 @@ =head1 DESCRIPTION $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records; - $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} = $relevant_record_type; + if ($debugging_note) { + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; + } + } } @@ -528,9 +531,12 @@ =head1 DESCRIPTION $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records; - $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} = 'via flag'; + if ($debugging_note) { + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; + + } } } @@ -588,9 +594,11 @@ =head1 DESCRIPTION $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = $note; } $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records; - $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} = 'from diseaseHP'; + if ($debugging_note) { + 
$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; + } } } @@ -677,6 +685,66 @@ =head1 DESCRIPTION } + # B. second pass, if the internal note did not match in the loop above, add it to the note for manual indexing if that workflow_tag exists, so that the information gets into the Alliance + # (Many of the non-matching notes are edits to original curation, so makes sense to add here). + unless ($switch) { + + my $workflow_type = "2_manual_indexing"; + + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}) { + + my $workflow_type_timestamp = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created}"; + my $workflow_type_currecs = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs}"; + my $workflow_type_curator = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by}"; + + my $int_note_curator_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $int_note_timestamp); + + if (defined $int_note_curator_details) { + + my $int_note_currecs = "$int_note_curator_details->{currecs}"; + my $int_note_curator = "$int_note_curator_details->{curator}"; + + my $debugging_note = "ADDED in second pass: $int_note"; + $debugging_note =~ s/\n/ /g; + + $switch++; + + + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) { + + + my $note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}"; + + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$note||$int_note"); + + + } else { + + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$int_note"); + + } + + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note}) { + + + my $existing_note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note}"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = 
&clean_note("$existing_note||$debugging_note"); + + + } else { + + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = &clean_note("$debugging_note"); + + } + + + } + + } + } + + + } else { print "WARNING: internal note(s) with MULTIPLE TIMESTAMPS: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n"; From e155f1a0a2b0d17d888617cd0856845a90be1e2a Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Mon, 2 Feb 2026 11:43:47 +0000 Subject: [PATCH 36/43] adding another filter --- populate_workflow_status.pl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 0782684..a5daee4 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -306,6 +306,9 @@ =head1 DESCRIPTION # filter to remove lines that would be better converted into a note when submit curation record filename info. '^Curation record .*? is to add the allele phendesc data originally curated in .+$', + # filter to remove preliminary data that will not be submitted to the Alliance + '^HDM flag future.+$', + ]; From 9b4001ff2bc43563268c1a13c2c200701a4a7e35 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Mon, 2 Feb 2026 11:57:00 +0000 Subject: [PATCH 37/43] adding another filter --- populate_workflow_status.pl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index a5daee4..3e01b20 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -305,6 +305,8 @@ =head1 DESCRIPTION # filter to remove lines that would be better converted into a note when submit curation record filename info. '^Curation record .*? is to add the allele phendesc data originally curated in .+$', + '^Curation record .*? is to fix data for MI4 .+$', + '^Curation record .*? 
generated by hand to fix MI4 ticket.+$', # filter to remove preliminary data that will not be submitted to the Alliance '^HDM flag future.+$', From 047fe46f5c4507b261a077538df2cf5ec54f3258 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Mon, 2 Feb 2026 16:33:03 +0000 Subject: [PATCH 38/43] adding second pass to get more internal notes in --- populate_workflow_status.pl | 98 +++++++++++++++++++++++++------------ 1 file changed, 66 insertions(+), 32 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 3e01b20..500f475 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -193,6 +193,7 @@ =head1 DESCRIPTION '1_skim' => { 'finished_status' => 'ATP:0000330', # first pass curation finished 'relevant_record_type' => ['skim'], + 'second_pass' => '1', }, @@ -207,6 +208,7 @@ =head1 DESCRIPTION }, }, 'high_priority_override' => 'ATP:0000274', # manual indexing needed + 'second_pass' => '1', }, @@ -645,12 +647,11 @@ =head1 DESCRIPTION #print "HERE2: workflow: $workflow_type_currecs, $workflow_type_timestamp\n"; + my $int_note_curator_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $int_note_timestamp); + # 2. if the timestamps of the workflow tag and internal note match if ($int_note_timestamp eq $workflow_type_timestamp) { - my $int_note_curator_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $int_note_timestamp); - - # 3. 
get the curation record details for the internal note to check against those of workflow_tag if (defined $int_note_curator_details) { @@ -684,72 +685,105 @@ =head1 DESCRIPTION } + } else { + + # if the timestamps do not match, but the curation record filename matches the type, its still OK to add the note + foreach my $relevant_record_type (@{$workflow_tag_mapping->{$workflow_type}->{'relevant_record_type'}}) { + + unless ($switch) { + + if (defined $int_note_curator_details) { + if ($int_note_curator_details->{currecs} =~ m/$relevant_record_type/) { + + $switch++; + + + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) { + + + my $existing_note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$existing_note||$int_note"); + + + } else { + + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$int_note"); + } + } + + } + + } + } + } } } - # B. second pass, if the internal note did not match in the loop above, add it to the note for manual indexing if that workflow_tag exists, so that the information gets into the Alliance - # (Many of the non-matching notes are edits to original curation, so makes sense to add here). - unless ($switch) { + # B. second pass, if the internal note did not match in the loop above, add it to the note for the workflow_type if appropriate, so that the information gets into the Alliance + # (Many of the non-matching notes are edits to original curation, so makes sense to add to either skim/manual_indexing workflow types). 
+ foreach my $workflow_type (reverse sort keys %{$workflow_tag_mapping}) { - my $workflow_type = "2_manual_indexing"; + if (exists $workflow_tag_mapping->{$workflow_type}->{'second_pass'}) { - if (exists $workflow_status_data->{$pub_id}->{$workflow_type}) { + unless ($switch) { - my $workflow_type_timestamp = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created}"; - my $workflow_type_currecs = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs}"; - my $workflow_type_curator = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by}"; + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}) { - my $int_note_curator_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $int_note_timestamp); + my $workflow_type_timestamp = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created}"; + my $workflow_type_currecs = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs}"; + my $workflow_type_curator = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by}"; - if (defined $int_note_curator_details) { + my $int_note_curator_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $int_note_timestamp); - my $int_note_currecs = "$int_note_curator_details->{currecs}"; - my $int_note_curator = "$int_note_curator_details->{curator}"; + if (defined $int_note_curator_details) { - my $debugging_note = "ADDED in second pass: $int_note"; - $debugging_note =~ s/\n/ /g; + my $int_note_currecs = "$int_note_curator_details->{currecs}"; + my $int_note_curator = "$int_note_curator_details->{curator}"; - $switch++; + my $debugging_note = "ADDED in second pass: $int_note ($int_note_curator, $int_note_currecs)"; + $debugging_note =~ s/\n/ /g; + $switch++; - if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) { + if (exists 
$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) { - my $note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}"; - $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$note||$int_note"); + my $note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$note||$int_note"); - } else { - $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$int_note"); + } else { - } + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$int_note"); - if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note}) { + } + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note}) { - my $existing_note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note}"; - $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = &clean_note("$existing_note||$debugging_note"); + my $existing_note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note}"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = &clean_note("$existing_note||$debugging_note"); - } else { - $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = &clean_note("$debugging_note"); + } else { - } + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = &clean_note("$debugging_note"); + + } + } + } } - } } - } else { print "WARNING: internal note(s) with MULTIPLE TIMESTAMPS: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n"; From b1b2d62470327d383926b6f9c6061ddaadaed9bc Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Mon, 2 Feb 2026 16:52:20 +0000 Subject: [PATCH 39/43] tidying --- populate_workflow_status.pl | 5 ----- 1 file changed, 5 deletions(-) diff --git 
a/populate_workflow_status.pl b/populate_workflow_status.pl index 500f475..fbbb674 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -374,9 +374,7 @@ =head1 DESCRIPTION my $FBrf = $pub_id_to_FBrf->{$pub_id}->{'FBrf'}; my $pub_type = $pub_id_to_FBrf->{$pub_id}->{'type'}; - # maybe do nocur validation and record status - need to make new subroutine my ($nocur_status, $nocur_timestamp, $nocur_note) = &check_and_validate_nocur($nocur_flags, $has_genetic_data, $pub_id); -# my ($nocur_status, $nocur_timestamp, $nocur_note) = ''; foreach my $workflow_type (sort keys %{$workflow_tag_mapping}) { @@ -645,7 +643,6 @@ =head1 DESCRIPTION my $workflow_type_currecs = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs}"; my $workflow_type_curator = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by}"; - #print "HERE2: workflow: $workflow_type_currecs, $workflow_type_timestamp\n"; my $int_note_curator_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $int_note_timestamp); @@ -659,8 +656,6 @@ =head1 DESCRIPTION my $int_note_curator = "$int_note_curator_details->{curator}"; - #print "HERE: internal note: $int_note_currecs, $int_note_timestamp\n"; - # 4. 
if the curation record for the workflow type is in the list of possibilities for the internal note # and there is only one curator possibility for that timestamp # then the internal note can be added to the entry for that workflow tag From 8412299ffb5e31a907b29cb966d055752174c258 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Mon, 2 Feb 2026 17:11:33 +0000 Subject: [PATCH 40/43] adding printing to output files, printing json, first attempt at test mode json submission to ABC --- populate_workflow_status.pl | 79 +++++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 16 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index fbbb674..01335ea 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -231,18 +231,22 @@ =head1 DESCRIPTION # open output and error logging files -#open my $json_output_file, '>', "FB_workflow_status_data.json" -# or die "Can't open json output file ($!)\n"; +open my $json_output_file, '>', "FB_workflow_status_data.json" + or die "Can't open json output file ($!)\n"; +binmode($json_output_file, ":utf8"); -#open my $data_error_file, '>', "FB_workflow_status_data_errors.err" -# or die "Can't open data error logging file ($!)\n"; +open my $data_error_file, '>', "FB_workflow_status_data_errors.err" + or die "Can't open data error logging file ($!)\n"; +binmode($data_error_file, ":utf8"); -#open my $process_error_file, '>', "FB_workflow_status_process_errors.err" -# or die "Can't open processing error logging file ($!)\n"; +open my $process_error_file, '>', "FB_workflow_status_process_errors.err" + or die "Can't open processing error logging file ($!)\n"; +binmode($process_error_file, ":utf8"); -#open my $plain_output_file, '>', "FB_workflow_status_data.txt" -# or die "Can't open plain output file ($!)\n"; +open my $plain_output_file, '>', "FB_workflow_status_data.txt" + or die "Can't open plain output file ($!)\n"; +binmode($plain_output_file, ":utf8"); print STDERR 
"##Starting processing: " . (scalar localtime) . "\n"; @@ -781,14 +785,14 @@ =head1 DESCRIPTION } else { - print "WARNING: internal note(s) with MULTIPLE TIMESTAMPS: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n"; + print $data_error_file "WARNING: internal note(s) with MULTIPLE TIMESTAMPS: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n"; } unless ($switch) { - print "WARNING: internal note(s) that could not match up: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n"; + print $data_error_file "WARNING: internal note(s) that could not match up: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n"; } @@ -832,17 +836,60 @@ =head1 DESCRIPTION my $relevant_record_type = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} : ''; - print "DATA:$pub_id\t$FBrf\t$pub_type\t$relevant_record_type\t$curated_by\t$curation_records\t$workflow_tag_id\t$curation_tag\t$date_created\t$note\t$debugging_note\n"; + print $plain_output_file "DATA:$pub_id\t$FBrf\t$pub_type\t$relevant_record_type\t$curated_by\t$curation_records\t$workflow_tag_id\t$curation_tag\t$date_created\t$note\t$debugging_note\n"; } } -print Dumper ($complete_data); +#print Dumper ($complete_data); -#close $json_output_file; -#close $data_error_file; -#close $process_error_file; -#close $plain_output_file; +unless ($ENV_STATE eq "test") { + + my $json_metadata = &make_abc_json_metadata($db, $api_endpoint); + + $complete_data->{"metaData"} = $json_metadata; + my $complete_json_data = $json_encoder->encode($complete_data); + + print $json_output_file $complete_json_data; + + +} else { + + + foreach my $element (@{$complete_data->{"data"}}) { + + my $json_element = 
$json_encoder->encode($element); + + + my $cmd="curl -X 'POST' 'https://stage-literature-rest.alliancegenome.org/$api_endpoint/' -H 'accept: application/json' -H 'Authorization: Bearer $access_token' -H 'Content-Type: application/json' -d '$json_element'"; + my $raw_result = `$cmd`; + + + + if ($raw_result =~ m/^\d+$/) { + + print $plain_output_file "json post success\nJSON:\n$json_element\n\n"; + + } else { + + print $process_error_file "json post failed\nJSON:\n$json_element\nREASON:\n$raw_result\n#################################\n\n"; + + } + + + } + + + + +} + + + +close $json_output_file; +close $data_error_file; +close $process_error_file; +close $plain_output_file; print STDERR "##Ended processing: " . (scalar localtime) . "\n"; From 2b124dbd002953b5a107270e521b7cf988e3c930 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 3 Feb 2026 08:22:56 +0000 Subject: [PATCH 41/43] doc tweak --- populate_workflow_status.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 01335ea..9804ee1 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -102,7 +102,7 @@ =head1 DESCRIPTION o json output file (FB_workflow_status_data.json) containing a single json structure for all the data (manual_indexing status data is a set of arrays within a 'data' object, plus there is a 'metaData' object to indicate source). Data is printed to this file in all modes except 'test'. - o 'plain' output file (FB_workflow_status_data.txt) to aid in debugging - prints the same data as in the json file, but with a single 'DATA:' tsv row for each FBrf+topic combination. Data is printed to this file in all modes except 'production'. In 'test' mode + o 'plain' output file (FB_workflow_status_data.txt) to aid in debugging - prints the same data as in the json file, but with a single 'DATA:' tsv row for each FBrf+topic combination. Data is printed to this file in all modes. 
o FB_workflow_status_data_errors.err - errors in mapping FlyBase data to appropriate Alliance json are printed in this file. Data is printed to this file in all modes. @@ -586,6 +586,7 @@ =head1 DESCRIPTION # build reference with information for this publication+workflow type combination my $FBrf_with_prefix="FB:".$FBrf; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created} = $timestamp; $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_updated} = $timestamp; $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by} = $curator; From ea3561162d24eafa420a9e956a7687f4026ac2e7 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 3 Feb 2026 08:49:46 +0000 Subject: [PATCH 42/43] doc outlining script logic --- populate_workflow_status.pl | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 9804ee1..8ab431f 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -66,6 +66,27 @@ =head1 DESCRIPTION (NB: In the Alliance, curation status for individual datatypes (Alliance name: 'topics') are stored in 'curation_status', NOT 'workflow_tag', so FB curation status info for these types of curation (e.g. phenotype, physical interactions) are not dealt with by this script, but instead by populate_topic_curation_status.pl). +Script logic: + + +o Uses FB 'curated_by' pubprop information to determine the curation status of the three FB curation types being mapped to workflow_tag. Sets curation status to 'done' when a file of the standard expected filename format is found for a given curation type. + +o Uses the 'nocur' flag to identify papers that contain 'no genetic information'. Validates that the nocur flag is correct and then sets manual_indexing status to 'won't curate' (with 'no genetic information' curation_tag), overriding any 'done' status added in the first step above. 
+ +o Identifies papers that have not yet been manually indexed, but which contain high-priority data. Sets curation status to 'curation needed' for manual indexing, with a note explaining why the paper is high priority. + + +o Adds publication-level internal notes to the 'note' of the appropriate workflow_tag curation type + + o first filters out internal notes that are either not being submitted to the Alliance or will be submitted in a different script (e.g. attached either to a topic or a topic curation status). + + o uses the internal note timestamp to identify which of the three workflow_tag curation types to add the internal note to. + + o For any internal notes where the timestamp did not match any of the workflow_tag timestamps for that publication (can happen if the note was added as an edit record), adds the note to the manual indexing status (if that exists), or otherwise to the first-pass curation status (if that exists). + + o Any internal notes that have not been matched up and added in the above steps are printed in the FB_workflow_status_data_errors.err file. 
+ + Script has three modes: o dev mode @@ -398,10 +419,9 @@ =head1 DESCRIPTION unless (exists $workflow_status_data->{$pub_id}->{$workflow_type}) { - # make new subroutine to get relevant curator and timestamp from $fb_data my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}, $pub_id); - # if there is a matching record for the workflow type, make a json structure for submitting data to the Alliance + # if there is a matching record for the workflow type, store the information for submitting data to the Alliance if (defined $curator_details) { From ca0c217a7099f6776cda925c6d8b2591e66a8118 Mon Sep 17 00:00:00 2001 From: Gillian Millburn Date: Tue, 3 Feb 2026 09:17:46 +0000 Subject: [PATCH 43/43] adding wt_exp needs expression to high priority --- populate_workflow_status.pl | 54 ++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl index 8ab431f..18b8464 100644 --- a/populate_workflow_status.pl +++ b/populate_workflow_status.pl @@ -299,13 +299,27 @@ =head1 DESCRIPTION my $has_genetic_data = &pub_has_curated_data($dbh, 'genetic_data'); ## get diseaseHP flags so can assign manual indexing 'needs curation' where appropriate -my $diseaseHP_flags = {}; +my $high_priority_flags = {}; -$diseaseHP_flags->{'dis_flag'} = &get_matching_pubprop_value_with_timestamps($dbh,'dis_flag','^diseaseHP$'); +$high_priority_flags->{'diseaseHP_dis_flag'} = &get_matching_pubprop_value_with_timestamps($dbh,'dis_flag','^diseaseHP$'); -$diseaseHP_flags->{'harv_flag'} = &get_matching_pubprop_value_with_timestamps($dbh,'harv_flag','^diseaseHP$'); +$high_priority_flags->{'diseaseHP_harv_flag'} = &get_matching_pubprop_value_with_timestamps($dbh,'harv_flag','^diseaseHP$'); -#print Dumper ($diseaseHP_flags); +$high_priority_flags->{'wt_exp_needs_curation'} = &get_matching_pubprop_value_with_timestamps($dbh,'harv_flag','^wt_exp::Needs cam curation$'); 
+ + +my $high_priority_mapping = { + + + 'diseaseHP_dis_flag' => 'diseaseHP', + 'diseaseHP_harv_flag' => 'diseaseHP', + 'wt_exp_needs_curation' => 'wt_exp::Needs cam curation', + + + +}; + +#print Dumper ($high_priority_flags); my $additional_filters = [ @@ -574,31 +588,32 @@ =head1 DESCRIPTION # next, see if there are any non-curated papers that should be marked as 'need curation' as they are high-priority if (exists $workflow_tag_mapping->{$workflow_type}->{'high_priority_override'}) { - foreach my $flag_type (sort keys %{$diseaseHP_flags}) { + foreach my $flag_type (sort keys %{$high_priority_flags}) { unless (exists $workflow_status_data->{$pub_id}->{$workflow_type}) { - if (exists $diseaseHP_flags->{$flag_type}->{$pub_id}) { + if (exists $high_priority_flags->{$flag_type}->{$pub_id}) { my $ATP = $workflow_tag_mapping->{$workflow_type}->{'high_priority_override'}; my $curation_tag = "ATP:0000353"; # high priority data - my $note = ''; + + my $note = "$high_priority_mapping->{$flag_type}"; my $debugging_note = ''; - my $timestamp = $diseaseHP_flags->{$flag_type}->{$pub_id}->{diseaseHP}[0]; + my $timestamp = $high_priority_flags->{$flag_type}->{$pub_id}->{"$high_priority_mapping->{$flag_type}"}[0]; # set generic defaults that are overwritten later with more specific information my $curator = 'FB_curator'; my $curation_records = ''; - # get curator details for the diseaseHP flag and use to override generic defaults where possible - my $diseaseHP_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp); + # get curator details for the high priority flag and use to override generic defaults where possible + my $high_priority_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp); - if (defined $diseaseHP_details) { + if (defined $high_priority_details) { - $curator = "$diseaseHP_details->{curator}"; - $curation_records = 
"$diseaseHP_details->{currecs}"; + $curator = "$high_priority_details->{curator}"; + $curation_records = "$high_priority_details->{currecs}"; } @@ -620,11 +635,18 @@ =head1 DESCRIPTION } - if ($note) { - $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = $note; + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) { + + my $existing_note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = "$existing_note, $note"; + + + } else { + + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = "$note"; + } $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records; - $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} = 'from diseaseHP'; if ($debugging_note) { $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note;