diff --git a/lib/AuditTable.pm b/lib/AuditTable.pm index 4ea5e39..5d889ff 100755 --- a/lib/AuditTable.pm +++ b/lib/AuditTable.pm @@ -5,7 +5,7 @@ use warnings; require Exporter; our @ISA = qw(Exporter); -our @EXPORT = qw(get_relevant_curator get_flag_info_with_audit_data get_timestamps_for_flag_with_suffix get_timestamps_for_flaglist_with_suffix get_timestamps_for_pubprop_value get_relevant_currec_for_datatype get_matching_pubprop_value_with_timestamps get_all_flag_info_with_timestamps); +our @EXPORT = qw(get_relevant_curator get_all_currec_data get_flag_info_with_audit_data get_timestamps_for_flag_with_suffix get_timestamps_for_flaglist_with_suffix get_timestamps_for_pubprop_value get_relevant_currec_for_datatype get_matching_pubprop_value_with_timestamps get_all_flag_info_with_timestamps get_all_pub_internal_notes_for_tet_wf); =head1 MODULE: AuditTable @@ -137,6 +137,69 @@ sub get_relevant_curator { return ($data); } +sub get_all_currec_data { + + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: get_all_currec_data + Usage: get_all_currec_data(database_handle); + Function: Stores curator and corresponding curation_record_filename information for all 'curated_by' pubprops, keyed on pub_id. + Example: my $all_curation_record_data = &get_all_currec_data($dbh); + +Arguments: database handle + +Returns: data structure with the following format: + +$data->{$pub_id}->{$timestamp}->{$curator}->{$curation_record_filename} + + +The get_relevant_curator_from_candidate_list_using_pub_and_timestamp in lib/Util.pm can be used to filter the returned list for the curator (and corresponding curation_record_filename information) that matches a specific pub_id plus timestamp combination. 
+ + +=cut + + + unless (@_ == 1) { + + die "Wrong number of parameters passed to the get_all_currec_data subroutine\n"; + } + + my ($dbh) = @_; + + my $data = {}; + + my $sql_query=sprintf("select distinct pp.pub_id, pp.value, ac.transaction_timestamp from pubprop pp, cvterm c, audit_chado ac where ac.audited_table='pubprop' and ac.audit_transaction='I' and pp.pubprop_id=ac.record_pkey and c.cvterm_id=pp.type_id and c.name = 'curated_by' order by ac.transaction_timestamp"); + + my $db_query = $dbh->prepare($sql_query); + $db_query->execute or die "WARNING: ERROR: Unable to execute get_all_currec_data query ($!)\n"; + + + while (my ($pub_id, $curated_by_value, $timestamp) = $db_query->fetchrow_array) { + + + if ($curated_by_value =~ m/^Curator: (.+?);Proforma: (.+?);timelastmodified: (.*)$/) { + + my $curator = $1; + my $curation_record_filename = $2; + my $timelastmodified = $3; + + $data->{$pub_id}->{$timestamp}->{$curator}->{$curation_record_filename}++; + + } else { + # not expecting to trip this error + print "ERROR: wrong curated_by pubprop format for pub_id: $pub_id, pubprop: $curated_by_value\n"; + } + } + + return $data; +} + + sub get_flag_info_with_audit_data { @@ -470,7 +533,7 @@ sub get_relevant_currec_for_datatype { Title: get_relevant_currec_for_datatype Usage: get_relevant_currec_for_datatype(database_handle,datatype); - Function: Gets timestamp information for curation records that are expected to contain data of a particular type based on the their filename, for the datatype specified in the second argument. + Function: Gets timestamp information for curation records that are expected to contain data/curation of a particular type based on the their filename, for the data/curation type specified in the second argument. 
Example: my $currec_for_cell_line = &get_relevant_currec_for_datatype($dbh,'cell_line'); Arguments: @@ -488,6 +551,7 @@ Arguments: # only use standard filenames for matching so that can be sure that curation of the datatype is complete my $datatype_mapping = { + # filename types used for topic curation status 'cell_line' => '.+?\.(cell|cell_multiple|cell_multi)\..+?', 'phys_int' => '.+?\.(int|int_miRNA)\..+?', 'DO' => '.+?\.DO\..+?', @@ -496,8 +560,15 @@ Arguments: 'chemical' => '.+?\.chem\..+?', 'args' => '.+?\.args\..+?', 'phen' => '[a-z][a-z][0-9]{1,}\.phen', - 'cam_full' => '(ma|sb|rd|sf|tj|al|sm|cm|pm|gm|ao|cp|lp|sp|sr|rs|ra|ds|ew|cy)[0-9]{1,}(\.(h|hf))?', 'humanhealth' => '.+?\.(hh|hh_multiple|hds|hds_multiple|hh_[0-9]{1,}|hds_rvw)\..+?', + # filename types used for workflow status + 'thin' => '[a-z][a-z][0-9]{1,}\.thin', + 'user' => '[a-z][a-z][0-9]{1,}\.user', + 'skim' => '.+?\.(skim|skim_multiple|skim_multi)(\..+)?', + 'gene_full' => '[a-z][a-z][0-9]{1,}\.full', # publications that have had full manual indexing (listing) of genes, and genes are the only entity type relevant to manual indexing in the publication + 'cam_no_suffix' => '(ma|sb|rd|sf|tj|kk|al|sm|cm|pm|gm|ao|cp|lp|sp|sr|rs|ra|ds|vt|ew|cy|pu|ga|ha|hb|rc|rf|pg|st|rz)[0-9]{1,}', + # filename types used for both topic curation status AND workflow status + 'cam_full' => '(ma|sb|rd|al|sm|cm|pm|gm|lp|sr|rs|ra|ds|ew|cy)[0-9]{1,}(\.(h|hf))?', # publications that should have had full curation of both any genetic reagent data AND any phenotype data @@ -719,3 +790,144 @@ o $audit_timestamp is a timestamp from the audit_chado table. 
The $audit_timesta return $data; } + + + + + +sub get_all_pub_internal_notes_for_tet_wf { + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: get_all_pub_internal_notes_for_tet_wf + Usage: get_all_pub_internal_notes_for_tet_wf(database_handle); + Function: Gets all publication level internal notes that we want to try to submit to the Alliance ABC in either the topic-entity-tg (tet) editor or the workflow editor, additionally filtered using the regexes in the array reference provided in the second argument. + Example: my $all_pub_internal_notes_for_tet_wf = &get_all_pub_internal_notes_for_tet_wf($dbh, $additional_filters); + +Arguments: + +o database handle + +o $additional_filters - reference to an array of additional regular expressions to be used to filter the internal notes before they are returned. (This array reference can be empty). + +Returns: + +The returned hash reference has the following structure: + +@{$data->{$pub_id}->{$line}}, $audit_timestamp; + + +o $pub_id is the pub_id of the reference. + +o $line is the complete internal note; if the internal note contains multiple lines they will be present in the returned $line, so this may need to be converted to spaces when submitting to the ABC if you want the note to appear as a single line. + +o $audit_timestamp is the 'I' timestamp(s) from the audit_chado table for the matching $line. The $audit_timestamp values are sorted earliest to latest within the array. + + +The subroutine only returns internal notes that we want to end up in either the topic-entity-tg (tet) editor or the workflow editor in the Alliance ABC. Internal notes of two kinds are thus removed from returned output by default, using the regular expressions in the following array references: + + +o $ignore - internal notes that are related to the bibliographic data and are no longer relevant. 
+ +o $email - email address information (either community curation or the submitter of a personal communication) - these will be submitted elsewhere in the ABC. + + +In addition, any regular expressions passed via the $additional_filters array reference are also used to filter the returned output. Note that the regexes are used to remove matching text from each internal note; any remaining text after the substitutions is returned. This strategy is necessary as in some cases, a single internal note contains multiple lines that may need different treatment when being exported to the ABC. For example, some internal notes in FB are not being submitted as free text 'notes' to the ABC, but are instead being turned into an ATP term representing a topic, topic status or a controlled tag; in these cases, the regexes in $additional_filters can be used to remove the relevant internal notes so that they are not then redundantly also submitted to the ABC as a free text note. + +This means that any regex in $additional_filters should typically be specified to either remove a *complete* line (using ^ and $ in the regex), or be sufficiently specific that it is safe to remove the text when it is part of a line (e.g. similar to the email regexes in the $email array reference below). + +(It isn't possible to just split on newline to separate out each 'row' of a given 'multiple lines' internal note, because in some cases, the multiple lines correspond to a paragraph (i.e. the multiple lines should stay together when submitted to the ABC) but in others, each row represents a different kind of internal note that may need to be split up and treated differently as they are submitted to the ABC (e.g. those that are being converted into an ATP term as described above). 
+ + + +=cut + + unless (@_ == 2) { + + die "Wrong number of parameters passed to the get_all_pub_internal_notes_for_tet_wf subroutine\n"; + } + + + my ($dbh, $additional_filters) = @_; + + my $data = {}; + + + my $ignore = [ + + 'Automated trawl for PMID, ggrumbli 090330', + 'BIOSIS Gene Name field: .+', + '^[?:]?Supplement:.+$', # there are some cases with ? or : at the beginning of the Supplement: line + ]; + + + my $email = [ + + '(Author|User|Submitter|submitter): ?.*?\?$', + '(Author|User|Submitter|submitter): ?.*?\? [a-z]{2}[0-9]{1,}\.$', + '(Author|User|Submitter|submitter): ?.*?\?$', + '(Author|User|Submitter|submitter): ?.*?\?$', + '(Author|User|Submitter|submitter) .*?\?\. ?[a-z]{2}[0-9]{1,}\.$', + + # remove email info from beginning of line - require <> around email part of regex else is not strict enough + '(Author|User|Submitter|submitter): .*?\<\S+?@\S+?\>\. ', + + '^(Author|User|Submitter|submitter):$', + + + + ]; + + + my $sql_query = sprintf("select distinct pp.pub_id, pp.value, ac.transaction_timestamp from pubprop pp, cvterm c, audit_chado ac where c.cvterm_id=pp.type_id and c.name ='internalnotes' and ac.audited_table='pubprop' and ac.audit_transaction = 'I' and pp.pubprop_id=ac.record_pkey order by ac.transaction_timestamp"); + my $db_query = $dbh->prepare($sql_query); + $db_query->execute or die "WARNING: ERROR: Unable to execute get_timestamps_for_pubprop_value query ($!)\n"; + + + while (my ($pub_id, $value, $audit_timestamp) = $db_query->fetchrow_array) { + + + foreach my $regex (@{$ignore}) { + + $value =~ s/$regex//mg; + + + } + + foreach my $regex (@{$email}) { + + $value =~ s/$regex//mg; + + + } + + foreach my $regex (@{$additional_filters}) { + + $value =~ s/$regex//mg; + + + } + + + $value =~ s/^ +//; + $value =~ s/ +$//; + $value =~ s/^\n+//; + $value =~ s/\n+$//; + + $value =~ s/\t//g; + + if ($value ne '') { + + push @{$data->{$pub_id}->{$value}}, $audit_timestamp; + + } + } + + return $data; + +} + diff --git a/lib/Util.pm 
b/lib/Util.pm old mode 100644 new mode 100755 index 19711d1..25e558a --- a/lib/Util.pm +++ b/lib/Util.pm @@ -5,7 +5,7 @@ use warnings; require Exporter; our @ISA = qw(Exporter); -our @EXPORT = qw(make_abc_json_metadata get_yyyymmdd_date_stamp pub_has_curated_data); +our @EXPORT = qw(make_abc_json_metadata get_yyyymmdd_date_stamp pub_has_curated_data get_relevant_curator_from_candidate_list get_relevant_curator_from_candidate_list_using_pub_and_timestamp check_and_validate_nocur clean_note); use JSON::PP; @@ -195,6 +195,9 @@ o $data_type is the type of curated data. The value must be present as a key in ], +# the list of FBid types used in the query is deliberately strict, so that only publications that are about a gene or that use fly lines count as 'has genetic data' to make sure that the manual indexing status and cross-checks with nocur work correctly + 'genetic_data' => ['select distinct p.pub_id from pub p, feature f, feature_pub fp where p.is_obsolete = \'f\' and p.pub_id = fp.pub_id and f.feature_id = fp.feature_id and f.is_obsolete = \'f\' and f.uniquename ~\'^FB(gn|al|ab|ba|ti|te)\'', + ], }; @@ -222,4 +225,497 @@ o $data_type is the type of curated data. The value must be present as a key in } +sub get_relevant_curator_from_candidate_list { + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: get_relevant_curator_from_candidate_list + Usage: get_relevant_curator_from_candidate_list(candidate_list,pub_id); + Function: Gets relevant curator details for a pub_id specified in the second argument, from the candidate_list provided in the first argument. + Example: my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}, $pub_id); + +Arguments: + +candidate_list contains timestamp, curator name and curation record information for curation records of a particular type, keyed on pub_ids. 
+ +candidate_list needs to be a data structure with the following general format, which can be generated by using the get_relevant_currec_for_datatype subroutine (this returns two references that can first be put into the candidate_list->{"by_timestamp"} and candidate_list->{"by_curator"} halves of the data structure before it is passed to get_relevant_curator_from_candidate_list. + +o $candidate_list->{"by_timestamp"}->{pub_id} + + o each pub_id refers to an array of the timestamps associated with the pub_id for the curation records, ordered by date. + +o $candidate_list->{"by_curator"}->{pub_id}->{curator}->{timestamp}->{record_number} + + + +pub_id is the pub_id of a single reference. + +Returns: + + +o if the pub_id is NOT a key in $candidate_list->{"by_timestamp"}, returns undef (i.e. there is NO curation of this particular curation type for this pub_id). + +o if the pub_id IS a key in $candidate_list->{"by_timestamp"}, returns information for the *earliest* curation record from the candidate list (getting the relevant details from $candidate_list->{"by_curator"}). + + + o $curator_details->{curator} = curator name + o $curator_details->{timestamp} = timestamp + o $curator_details->{currecs} = curation record filename - useful for debugging + +If there is more than one 'relevant' curator for the *earliest* timestamp (i.e. if multiple curation records of the same type of curation were submitted for the same pub_id in the same data load) then: + +o curator is set as follows: + + o if any of the relevant curators are FB curators - set to 'FB_curator' + + o otherwise, if any of the relevant curators are community curation - set to 'User Submission' + + o otherwise - set to 'ERROR:unable to reconcile curator' so can easily be identified as an error. 
+ +o currecs is set to 'multiple curators for same timestamp' + + +=cut + + + unless (@_ == 2) { + + die "Wrong number of parameters passed to the get_relevant_curator_from_candidate_list subroutine\n"; + } + + + my ($data, $pub_id) = @_; + + my $curator_details = undef; + + + + if (exists $data->{"by_timestamp"}->{$pub_id}) { + + + # get the timestamp of the *earliest* matching curation record + my $timestamp = $data->{"by_timestamp"}->{$pub_id}[0]; + + # try to determine the relevant curator if appropriate for the topic + my $relevant_curator = ''; + my $relevant_records = ''; + + + if (exists $data->{"by_curator"}->{$pub_id}) { + + if (scalar keys %{$data->{"by_curator"}->{$pub_id}} == 1) { + my $curator_candidate = join '', keys %{$data->{"by_curator"}->{$pub_id}}; + + if (exists $data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$timestamp}) { + $relevant_curator = $curator_candidate; + $relevant_records = join ' ', sort keys %{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$timestamp}}; + + } + } else { + + my $count = 0; + my $curator_count = 0; + my $community_curation_count = 0; + foreach my $curator_candidate (sort keys %{$data->{"by_curator"}->{$pub_id}}) { + + foreach my $candidate_timestamp (sort keys %{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}}) { + + if ($candidate_timestamp eq $timestamp) { + + $relevant_curator = $curator_candidate; + $relevant_records = join ' ', sort keys %{$data->{"by_curator"}->{$pub_id}->{$curator_candidate}->{$candidate_timestamp}}; + + unless ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission' || $relevant_curator eq 'UniProtKB') { + + $curator_count++; + } + + if ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission') { + + $community_curation_count++; + } + $count++; + + } + } + } + + unless ($count == 1) { + if ($curator_count) { + $relevant_curator = 'FB_curator'; + + } elsif ($community_curation_count) { + + $relevant_curator = 
'User Submission'; + + } else { + + $relevant_curator = 'ERROR:unable to reconcile curator'; + } + $relevant_records = 'multiple curators for same timestamp'; + + } + } + } + + if ($relevant_curator) { + + # convert all unknown style curators to the same 'FB_curator' name that is used for persistent store submissions + if ($relevant_curator eq 'Unknown' || $relevant_curator eq 'Unknown Curator' || $relevant_curator eq 'Generic Curator' || $relevant_curator eq 'P. Leyland') { + + $relevant_curator = 'FB_curator'; + + } + + + $curator_details->{curator} = $relevant_curator; + $curator_details->{timestamp} = $timestamp; + $curator_details->{currecs} = $relevant_records; # useful for debugging + + } + + + } + + return $curator_details; +} + + + +sub get_relevant_curator_from_candidate_list_using_pub_and_timestamp { + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: get_relevant_curator_from_candidate_list_using_pub_and_timestamp + Usage: get_relevant_curator_from_candidate_list_using_pub_and_timestamp(candidate_list, pub_id of reference, timestamp); + Function: Extracts curator and corresponding curation_record_filename information from the candidate_list specified in the first argument, where it matches the specified pub_id (second argument) plus timestamp (third argument) combination. + Example: my $nocur_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $nocur_timestamp); + +Arguments: + +o candidate_list needs to be a data structure with the following general format (a data structure with the correct format that contains ALL curator related information can be generated by using the get_all_currec_data subroutine that is in lib/AuditTable.pm). + + o $candidate_list->{pub_id}->{timestamp}->{curator}->{curation_record_filename} + + +o pub_id is the pub_id of the single reference to be matched. + +o timestamp is the timestamp to be matched. 
+ +Returns: + + +o if there is no matching curator information for the pub_id+timestamp combination, returns undef. + +o if there is a single matching curator for the pub_id+timestamp combination, returns the following: + + + o $curator_details->{curator} = curator name + o $curator_details->{currecs} = curation record filename(s) - useful for debugging + + +o If there is *more than one* matching curator for the pub_id+timestamp combination, then instead of a returning an individual curator name, a more generic curator name is returned if possible, using the following logic: + + o if any of the relevant curators are FB curators - $curator_details->{curator} is set to 'FB_curator' + + o otherwise, if any of the relevant curators represent community curation - $curator_details->{curator} is set to 'User Submission' (the more general 'curator' for community curation). + + o otherwise - $curator_details->{curator} is set to 'ERROR:unable to reconcile curator' so can easily be identified as a potential error when debugging. + + o In addition, $curator_details->{currecs} is set to 'multiple curators for same timestamp' to help with debugging. 
+ + +=cut + + + unless (@_ == 3) { + + die "Wrong number of parameters passed to the get_relevant_curator_from_candidate_list_using_pub_and_timestamp subroutine\n"; + } + + + my ($data, $pub_id, $timestamp) = @_; + + my $curator_details = undef; + + if (exists $data->{$pub_id} && exists $data->{$pub_id}->{$timestamp}) { + + my $relevant_curator = ''; + my $relevant_records = ''; + + # single matching curator + if (scalar keys %{$data->{$pub_id}->{$timestamp}} == 1) { + $relevant_curator = join '', keys %{$data->{$pub_id}->{$timestamp}}; + $relevant_records = join ' ', sort keys %{$data->{$pub_id}->{$timestamp}->{$relevant_curator}}; + + # multiple matching curators + } else { + + my $fb_curator_count = 0; # count of number of matching FB curators + my $community_curation_count = 0; # count of number of matching types of community curation + foreach my $curator_candidate (sort keys %{$data->{$pub_id}->{$timestamp}}) { + + $relevant_curator = $curator_candidate; + $relevant_records = join ' ', sort keys %{$data->{$pub_id}->{$timestamp}->{$curator_candidate}}; + + unless ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission' || $relevant_curator eq 'UniProtKB') { + + $fb_curator_count++; + } + + if ($relevant_curator eq 'Author Submission' || $relevant_curator eq 'User Submission') { + + $community_curation_count++; + } + + } + + if ($fb_curator_count) { + $relevant_curator = 'FB_curator'; + + } elsif ($community_curation_count) { + + $relevant_curator = 'User Submission'; + + } else { + + $relevant_curator = 'ERROR:unable to reconcile curator'; + } + $relevant_records = 'multiple curators for same timestamp'; + + } + + + if ($relevant_curator) { + + + # convert all unknown style curators to the same 'FB_curator' name that is used for persistent store submissions + if ($relevant_curator eq 'Unknown' || $relevant_curator eq 'Unknown Curator' || $relevant_curator eq 'Generic Curator' || $relevant_curator eq 'P. 
Leyland') { + + $relevant_curator = 'FB_curator'; + + } + + + $curator_details->{curator} = $relevant_curator; + $curator_details->{currecs} = $relevant_records; # useful for debugging + + } + + + } + + return $curator_details; +} + + +sub check_and_validate_nocur { + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: check_and_validate_nocur + Function: Checks whether a publication has been marked as a 'nocur' (contains no genetic data), and validates whether that flagging is consistent with the types of entity attached to the publication in FlyBase. Returns information based on this checking and validation. + Example: my ($nocur_status, $nocur_timestamp, $nocur_note) = &check_and_validate_nocur($nocur_flags, $has_genetic_data, $pub_id); + +Arguments: + +o $nocur_flags - hash reference containing nocur/nocur_abs flag information + +o $has_genetic_data - list of all publications that have genetic entities attached to them. + +o $pub_id is the pub_id of the single publication to be checked. + +Returns: + + +$nocur_status is a boolean. It is only set to 1 if the publication being checked IS a nocur (has a nocur/nocur_abs flag in FlyBase) AND passed the validation (there are NO genetic entities attached to it). + +(i.e. if a publication has a nocur/nocur_abs flag in FlyBase but there ARE genetic entities attached to the publication, $nocur_status = 0). + + +$nocur_timestamp: timestamp of when the nocur/nocur_abs flag was added (if the flag exists). + +$nocur_note: free text note that can be added in the 'note' slot of the relevant ATP workflow_tag item. + + o contains 'Only looked at abstract.' for publications with a nocur_abs flag in FlyBase. 
+ + +=cut + + + + unless (@_ == 3) { + + die "Wrong number of parameters passed to the check_and_validate_nocur subroutine\n"; + } + + my ($nocur_flags, $has_genetic_data, $pub_id) = @_; + + + my $nocur_status = 0; + my $nocur_timestamp = ''; + my $note = ''; + + if (exists $nocur_flags->{$pub_id}) { + + # if there is curated genetic data then ignore any nocur flag as it must be an error + unless (exists $has_genetic_data->{$pub_id}) { + $nocur_status = 1; + } + + if (exists $nocur_flags->{$pub_id}->{nocur}) { + $nocur_timestamp = $nocur_flags->{$pub_id}->{nocur}[0]; + + } else { + + $nocur_timestamp = $nocur_flags->{$pub_id}->{nocur_abs}[0]; + + $note = "Only looked at abstract."; + } + } + + return ($nocur_status, $nocur_timestamp, $note); + + +} + + + + +sub clean_note { + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: clean_note + Function: Subroutine that 'cleans' text that is being submitted to the Alliance ABC as an internal 'note'. + Example: &clean_note($value); + +The conversions done are: + +- removes any tabs + +- converts superscripts/subscripts to curator-friendly form + + +- uses degreek subroutine to convert any greek symbols in FB 'sgml' format to spelt out greek name. + + +Does NOT substitute returns (because many of the internal notes being submitted are in multiline format which needs to be preserved). + + +=cut + unless (@_ == 1) { + + die "Wrong number of parameters passed to the clean_free_text subroutine\n"; + } + my ($string) = @_; + + if ($string =~ m/\t/) { + $string =~ s/\t/ /g; + } + + $string =~ s/\<\/down\>/\]\]/g; + $string =~ s/\/\[\[/g; + $string =~ s/\/\[/g; + $string =~ s/\<\/up\>/\]/g; + + $string = °reek($string); + + return $string; + +} + + + +sub degreek { + + +=head1 SUBROUTINE: +=cut + +=head1 + + Title: degreek + Function: Subroutine that converts sgml greek format to spelt out greek name. 
+ + +=cut + + + my $symbol = $_[0]; + + my %greek = ( + '&Agr;' => 'Alpha', + '&Bgr;' => 'Beta', + '&Ggr;' => 'Gamma', + '&Dgr;' => 'Delta', + '&Egr;' => 'Epsilon', + '&Zgr;' => 'Zeta', + '&EEgr;' => 'Eta', + '&THgr;' => 'Theta', + '&Igr;' => 'Iota', + '&Kgr;' => 'Kappa', + '&Lgr;' => 'Lambda', + '&Mgr;' => 'Mu', + '&Ngr;' => 'Nu', + '&Xgr;' => 'Xi', + '&Ogr;' => 'Omicron', + '&Pgr;' => 'Pi', + '&Rgr;' => 'Rho', + '&Sgr;' => 'Sigma', + '&Tgr;' => 'Tau', + '&Ugr;' => 'Upsilon', + '&PHgr;' => 'Phi', + '&KHgr;' => 'Chi', + '&PSgr;' => 'Psi', + '&OHgr;' => 'Omega', + '&agr;' => 'alpha', + '&bgr;' => 'beta', + '&ggr;' => 'gamma', + '&dgr;' => 'delta', + '&egr;' => 'epsilon', + '&zgr;' => 'zeta', + '&eegr;' => 'eta', + '&thgr;' => 'theta', + '&igr;' => 'iota', + '&kgr;' => 'kappa', + '&lgr;' => 'lambda', + '&mgr;' => 'mu', + '&ngr;' => 'nu', + '&xgr;' => 'xi', + '&ogr;' => 'omicron', + '&pgr;' => 'pi', + '&rgr;' => 'rho', + '&sgr;' => 'sigma', + '&tgr;' => 'tau', + '&ugr;' => 'upsilon', + '&phgr;' => 'phi', + '&khgr;' => 'chi', + '&psgr;' => 'psi', + '&ohgr;' => 'omega', + ); + + + $symbol =~ s/(&[a-z]{1,2}gr;)/$greek{$1}?"$greek{$1}":"$1"/egi; + + $symbol =~ s/∩/INTERSECTION/g; # conversion needed to make 'plain' symbol for any FBco symbols in note. 
+ + return $symbol; + +} diff --git a/populate_workflow_status.pl b/populate_workflow_status.pl new file mode 100644 index 0000000..18b8464 --- /dev/null +++ b/populate_workflow_status.pl @@ -0,0 +1,939 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +use DBI; +use JSON::PP; + +use File::Basename; +use File::Spec; + +# add lib sub-folder containing modules into @INC +use lib File::Spec->catdir(File::Basename::dirname(File::Spec->rel2abs($0)), 'lib'); + +# add modules from lib-subfolder +use AuditTable; +use ABCInfo; +use Util; +use Mappings; + +use Data::Dumper; +$Data::Dumper::Sortkeys = 1; + +use Encode; +binmode(STDOUT, ":utf8"); + +=head1 NAME populate_workflow_status.pl + + +=head1 SYNOPSIS + +Used to load FlyBase curation status information into the Alliance ABC literature database for those types of curation that are stored in the ABC in the 'workflow_tag' table, because they are curation types which are relevant to the publication workflow (how a publication 'moves' through the various automated/manual curation processes at a MOD). Generates an output file containing a single json structure for all the data (the workflow_tag status data is a set of arrays within a 'data' object, plus there is a 'metaData' object to indicate source and the API to use to submit the data into the ABC). + +=cut + + +=head1 USAGE + +USAGE: perl populate_workflow_status.pl pg_server db_name pg_username pg_password dev|test|production + + +=cut + + + +=head1 DESCRIPTION + +Types of curation that are relevant to the publication workflow (and thus stored in the Alliance in 'workflow_tag') include those that are: + +o important for deciding the next step in the publication workflow process (manual or automated) + + o e.g. if a paper has been community curation via FTYP, there is no need for a FB curator to do first-pass ('skim') curation. + +o a type of curation that must have been completed before another type of curation can occur + + o e.g. 
a paper must have been 'thin' curated (Alliance name: 'manual_indexing') before curation of phenotypic or DO data can occur. + +Three types of FB curation are mapped to the appropriate Alliance information by this script: + +o community curation + +o first pass curation by a biocurator ('skim' curation at FB) + +o manual indexing ('thin' curation at FB) + + +(NB: In the Alliance, curation status for individual datatypes (Alliance name: 'topics') are stored in 'curation_status', NOT 'workflow_tag', so FB curation status info for these types of curation (e.g. phenotype, physical interactions) are not dealt with by this script, but instead by populate_topic_curation_status.pl). + +Script logic: + + +o Uses FB 'curated_by' pubprop information to determine the curation status of the three FB curation types being mapped to workflow_tag. Sets curation status to 'done' when a file of the standard expected filename format is found for a given curation type. + +o Uses the 'nocur' flag to identify papers that contain 'no genetic information'. Validates that the nocur flag is correct and then sets manual_indexing status to 'won't curate' (with 'no genetic information' curation_tag), overriding any 'done' status added in the first step above. + +o Identifies papers have not yet been manually indexed, but which contain high-priority data. Sets curation status to 'curation needed' for manual indexing, with a note explaining why the paper is high priority. + + +o Adds publication-level internal notes to the 'note' of the appropriate workflow_tag curation type + + o first filters out internal notes that are either not being submitted to the Alliance or will be submitted in a different script (e.g. attached either to a topic or a topic curation status). + + o uses the internal note timestamp to identify which of the three workflow_tag curation types to add the internal note to. 
+ + o For any internal notes where the timestamp did not match any of the workflow_tag timestamps for that publication (can happen if the note was added as an edit record), add it to the manual indexing status (if that exists), or then the first-pass curation status (if that exists). + + o Any internal notes that have not been matched up and added in the above steps are printed in the FB_workflow_status_data_errors.err file. + + +Script has three modes: + +o dev mode + + o single FBrf mode: asks user for FBrf number (can also use a regular expression to test multiple FBrfs in this mode). + + o Data is printed to both the json output and 'plain' output files. + + o makes error files (see below) to record any errors. + + +o test mode + + o single FBrf mode: asks user for FBrf number (must be a single FBrf number). + + o uses curl to try to POST data to the Alliance ABC stage server (so asks user for okta token for Alliance ABC stage server). + + o Data (including a record of successful curl POST events) is printed to the 'plain' output file. + + o makes error files (see below) to record any errors. + + +o production mode + + o makes data for all relevant FBrfs in chado. + + o Data is printed to the json output file. + + o makes error files (see below) to record any errors. + + +o Output files + + + o json output file (FB_workflow_status_data.json) containing a single json structure for all the data (manual_indexing status data is a set of arrays within a 'data' object, plus there is a 'metaData' object to indicate source). Data is printed to this file in all modes except 'test'. + + o 'plain' output file (FB_workflow_status_data.txt) to aid in debugging - prints the same data as in the json file, but with a single 'DATA:' tsv row for each FBrf+topic combination. Data is printed to this file in all modes. + + o FB_workflow_status_data_errors.err - errors in mapping FlyBase data to appropriate Alliance json are printed in this file. 
Data is printed to this file in all modes. + + o FB_workflow_status_process_errors.err - processing errors - if a curl POST fails in test mode, the failed json element and the reason for the failure are printed in this file. Expected to be empty for all other modes. + +=cut + + +if (@ARGV != 6) { + warn "Wrong number of arguments, should be 6!\n"; + warn "\n USAGE: $0 pg_server db_name pg_username pg_password dev|test|production access_token\n\n"; + warn "\teg: $0 flysql24 production_chado zhou pwd dev|test|production ABCD1234\n\n"; + exit; +} + +my $server = shift(@ARGV); +my $db = shift(@ARGV); +my $user = shift(@ARGV); +my $pwd = shift(@ARGV); +my $ENV_STATE = shift(@ARGV); +my $access_token = shift(@ARGV); + +unless ($ENV_STATE eq 'dev'|| $ENV_STATE eq 'test'|| $ENV_STATE eq 'production') { + + warn "Unknown state '$ENV_STATE': must be 'dev', 'test' or 'production'\n\n"; + exit; + +} + +my $test_FBrf = ''; + +# variable that specifies the appropriate path (downstream of the base URL) to use for the Alliance Literature Service API. +# Used to add an element in the metaData object of the output json file and in the curl command used when in test mode. 
+my $api_endpoint = 'workflow_tag'; + + + +if ($ENV_STATE eq "test") { + + print STDERR "You are about to write data to the stage Alliance literature server\n"; + print STDERR "Type y to continue else anything else to stop:\n"; + + my $continue = <STDIN>; + chomp $continue; + if (($continue eq 'y') || ($continue eq 'Y')) { + print STDERR "Processing will continue.\n"; + } else { + die "Processing has been cancelled.\n"; + + } + +} + +if ($ENV_STATE eq "dev" || $ENV_STATE eq "test") { + + print STDERR "FBrf to test:"; + $test_FBrf = <STDIN>; + chomp $test_FBrf; + + if ($ENV_STATE eq "test") { + + unless ($test_FBrf =~ m/^FBrf[0-9]{7}$/) { + die "Only a single FBrf is allowed in test mode.\n"; + } + } +} else { + + $test_FBrf = '^FBrf[0-9]+$'; + +} + +my $dsource = sprintf("dbi:Pg:dbname=%s;host=%s;port=5432",$db,$server); +my $dbh = DBI->connect($dsource,$user,$pwd) or die "cannot connect to $dsource\n"; + +my $json_encoder = JSON::PP->new()->pretty(1)->canonical(1); + + + +my $workflow_tag_mapping = { + + # community curation + '0_user' => { + 'finished_status' => 'ATP:0000234', # community curation finished + 'relevant_record_type' => ['user'], + }, + + # first pass curation + '1_skim' => { + 'finished_status' => 'ATP:0000330', # first pass curation finished + 'relevant_record_type' => ['skim'], + 'second_pass' => '1', + + }, + + # manual indexing + '2_manual_indexing' => { + 'finished_status' => 'ATP:0000275', # manual indexing complete + 'relevant_record_type' => ['thin', 'cam_full', 'gene_full', 'cam_no_suffix'], # use an array so can go through the types in this order when assigning manual indexing status + 'nocur_override' => 'ATP:0000343', # won't manually index + 'pubtype_filter' => { + 'cam_no_suffix' => { + 'review' => '1', + }, + }, + 'high_priority_override' => 'ATP:0000274', # manual indexing needed + 'second_pass' => '1', + + }, + +}; + + +# structure of the required json for each workflow_tag element +#{ +# "date_created": $timestamp, +# "date_updated": 
$timestamp, +# "created_by": $curator, +# "updated_by": $curator, +# "mod_abbreviation": "FB", +# "reference_curie": FB:$FBrf, +# "workflow_tag_id": $ATP, # this is the ATP term that describes the status of the particular type of curation +# "curation_tag": # this is the ATP ID for the 'controlled_note' - most are negative +# "note": # this is a free text note +#} + + +# open output and error logging files + +open my $json_output_file, '>', "FB_workflow_status_data.json" + or die "Can't open json output file ($!)\n"; +binmode($json_output_file, ":utf8"); + +open my $data_error_file, '>', "FB_workflow_status_data_errors.err" + or die "Can't open data error logging file ($!)\n"; +binmode($data_error_file, ":utf8"); + +open my $process_error_file, '>', "FB_workflow_status_process_errors.err" + or die "Can't open processing error logging file ($!)\n"; +binmode($process_error_file, ":utf8"); + + +open my $plain_output_file, '>', "FB_workflow_status_data.txt" + or die "Can't open plain output file ($!)\n"; +binmode($plain_output_file, ":utf8"); + + +print STDERR "##Starting processing: " . (scalar localtime) . 
"\n"; + +my $all_curation_record_data = &get_all_currec_data($dbh); + + +## get relevant data for deciding curation status for the various types of workflow_tag curation + +my $fb_data = {}; + +foreach my $workflow_type (sort keys %{$workflow_tag_mapping}) { + + foreach my $relevant_record_type (@{$workflow_tag_mapping->{$workflow_type}->{'relevant_record_type'}}) { + + ($fb_data->{"$relevant_record_type"}->{"by_timestamp"}, $fb_data->{"$relevant_record_type"}->{"by_curator"}) = &get_relevant_currec_for_datatype($dbh,$relevant_record_type); + + } +} + + +#print Dumper ($fb_data); + + +## get data to assign 'no genetic data' curation tag and associated 'won't curate' workflow_tag term +my $nocur_flags = &get_matching_pubprop_value_with_timestamps($dbh,'cam_flag','^nocur|nocur_abs$'); + +## get publications that *do* have links to genetic objects (used for validation) +my $has_genetic_data = &pub_has_curated_data($dbh, 'genetic_data'); + +## get diseaseHP flags so can assign manual indexing 'needs curation' where appropriate +my $high_priority_flags = {}; + +$high_priority_flags->{'diseaseHP_dis_flag'} = &get_matching_pubprop_value_with_timestamps($dbh,'dis_flag','^diseaseHP$'); + +$high_priority_flags->{'diseaseHP_harv_flag'} = &get_matching_pubprop_value_with_timestamps($dbh,'harv_flag','^diseaseHP$'); + +$high_priority_flags->{'wt_exp_needs_curation'} = &get_matching_pubprop_value_with_timestamps($dbh,'harv_flag','^wt_exp::Needs cam curation$'); + + +my $high_priority_mapping = { + + + 'diseaseHP_dis_flag' => 'diseaseHP', + 'diseaseHP_harv_flag' => 'diseaseHP', + 'wt_exp_needs_curation' => 'wt_exp::Needs cam curation', + + + +}; + +#print Dumper ($high_priority_flags); + +my $additional_filters = [ + + # Use simple regex to remove *all* 'Dataset:' lines - will be converted to a topic and/or associated free text note in another script + # The commented out lines are the regexes needed to identify the 'Dataset: pheno' lines for the topic scripts + #'^Dataset: 
pheno\.?$', + #'^Dataset: pheno\. ?-?[a-z]{1,} ?[0-9]{6}(\.)?$', + #'^Dataset: pheno\. [0-9]{6}[a-z]{1,}\.$', + '^(D|d)ataset:.+$', + + # filters to remove lines that will be converted to a topic *status* and/or associated free text note in another script + '^HDM flag not applicable\.?( *[a-z]{1,}[0-9]{6})?$', + '^phen curation: only pheno_chem data in paper\.( *[a-z]{2}[0-9]{6}\.?)?$', + '^phen curation: No phenotypic data in paper\.( *[a-z]{2}[0-9]{6}\.?)?$', + '^phen_cur: CV annotations only(\. *[a-z]{2}[0-9]{6}\.?)?$', + '^phys_int not curated.+$', + '^FTA: DOcur genotype - need to check for missing drivers$', + + # filters to remove lines that will be converted to a free text note attached to a topic in another script + '^The phys_int flag inferred from.+$', + '^The phys_int flag is inferred from.+$', + '^The phys_int flag was inferred.+$', + '^FTYP cell line:.+$', + + # filter to remove lines that would be better converted into a note when submit curation record filename info. + '^Curation record .*? is to add the allele phendesc data originally curated in .+$', + '^Curation record .*? is to fix data for MI4 .+$', + '^Curation record .*? generated by hand to fix MI4 ticket.+$', + + # filter to remove preliminary data that will not be submitted to the Alliance + '^HDM flag future.+$', + + +]; + +## get any relevant internal notes to be added to the free text note slot of a workflow_tag element - first filtering out any internal notes that do not need to be added by this script (they will instead either be converted into an ATP term corresponding to a topic or controlled note in the Alliance, or added as a free text note to a specific topic), using the regexes specified in $additional_filters. 
+my $all_candidate_internal_notes = &get_all_pub_internal_notes_for_tet_wf($dbh, $additional_filters); + +## then remove any internal notes that come from curation records for a particular topic - these will be added as a note to the curation status of the *topic* in another script, rather than being added to the more general workflow_tag categories in this script. +# any remaining internal notes will be those either submitted under one of the workflow_tag types for this script (identified later through matching timestamp and curation record filename information) or under an 'edit' record - the latter will be added to the most appropriate workflow_tag with a note indicating it was an edit record +my @topic_record_types = ('cell_line', 'phys_int', 'DO', 'neur_exp', 'wt_exp', 'chemical', 'args', 'phen', 'humanhealth'); + +foreach my $topic_record_type (@topic_record_types) { + + my (undef, $curation_record_data) = &get_relevant_currec_for_datatype($dbh,$topic_record_type); + + + foreach my $pub_id (keys %{$all_candidate_internal_notes}) { + + foreach my $int_note (keys %{$all_candidate_internal_notes->{$pub_id}}) { + + foreach my $timestamp (@{$all_candidate_internal_notes->{$pub_id}->{$int_note}}) { + my $int_note_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp); + + if (defined $int_note_details && $int_note_details->{currecs} ne 'multiple curators for same timestamp') { + + if (exists $curation_record_data->{$pub_id}->{"$int_note_details->{curator}"} && exists $curation_record_data->{$pub_id}->{"$int_note_details->{curator}"}->{$timestamp}->{"$int_note_details->{currecs}"}) { + + delete $all_candidate_internal_notes->{$pub_id}->{$int_note}; + next; + } + } + + + } + } +} + + + +} + + +## Get list of publications: restrict to the type of pubs where it is useful to export workflow status info (same types as used in populate_topic_curation_status.pl) +my $pub_id_to_FBrf = {}; +my $sql_query = 
sprintf("select p.uniquename, p.pub_id, cvt.name from pub p, cvterm cvt where p.is_obsolete = 'f' and p.type_id = cvt.cvterm_id and cvt.is_obsolete = '0' and cvt.name in ('paper', 'erratum', 'letter', 'note', 'teaching note', 'supplementary material', 'retraction', 'personal communication to FlyBase', 'review') and p.uniquename ~'%s'", $test_FBrf); + +my $db_query= $dbh->prepare ($sql_query); +$db_query->execute or die" CAN'T GET FBrf FROM CHADO:\n$sql_query)\n"; +while (my ($uniquename, $pub_id, $pub_type) = $db_query->fetchrow_array()) { + $pub_id_to_FBrf->{$pub_id}->{'FBrf'} = $uniquename; + $pub_id_to_FBrf->{$pub_id}->{'type'} = $pub_type; +} + + +my $workflow_status_data = {}; + + +foreach my $pub_id (sort keys %{$pub_id_to_FBrf}) { + + my $FBrf = $pub_id_to_FBrf->{$pub_id}->{'FBrf'}; + my $pub_type = $pub_id_to_FBrf->{$pub_id}->{'type'}; + + my ($nocur_status, $nocur_timestamp, $nocur_note) = &check_and_validate_nocur($nocur_flags, $has_genetic_data, $pub_id); + + + foreach my $workflow_type (sort keys %{$workflow_tag_mapping}) { + + + foreach my $relevant_record_type (@{$workflow_tag_mapping->{$workflow_type}->{'relevant_record_type'}}) { + + if (exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'} && exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'}->{$relevant_record_type}) { + + + unless (exists $workflow_tag_mapping->{$workflow_type}->{'pubtype_filter'}->{$relevant_record_type}->{$pub_type}) { + + next; + + } + } + + unless (exists $workflow_status_data->{$pub_id}->{$workflow_type}) { + + my $curator_details = &get_relevant_curator_from_candidate_list($fb_data->{"$relevant_record_type"}, $pub_id); + + # if there is a matching record for the workflow type, store the information for submitting data to the Alliance + if (defined $curator_details) { + + + my $ATP = $workflow_tag_mapping->{$workflow_type}->{'finished_status'}; + my $curation_tag = ''; + my $note = ''; + # set values based on matching record (overriden in a few 
edge cases below) + my $curator = "$curator_details->{curator}"; + my $timestamp = "$curator_details->{timestamp}"; + my $curation_records = "$curator_details->{currecs}"; + my $debugging_note = ''; + + + + # for manual indexing, use any nocur information to override the workflow type (to the 'won't curate' style term) + # and add the appropriate 'no genetic data' curation_tag where appropriate + if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) { + + if ($nocur_status == 1) { + $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}; + $curation_tag = "ATP:0000207"; # no genetic data + $note = "$nocur_note"; + + unless ($timestamp eq $nocur_timestamp) { + $timestamp = "$nocur_timestamp"; + $debugging_note = "timestamp mismatch: curation record info overwritten by nocur flag info"; + + my $nocur_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp); + + if (defined $nocur_details) { + + $curator = "$nocur_details->{curator}"; + $curation_records = "$nocur_details->{currecs}"; + + } else { + $curator = 'FB_curator'; + $curation_records = "WARNING: unable to get curator details for nocur flag"; + + } + + + } + + } + + + } + + + # build reference with information for this publication+workflow type combination + + my $FBrf_with_prefix="FB:".$FBrf; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_updated} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{updated_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{mod_abbreviation} = "FB"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{reference_curie} = $FBrf_with_prefix; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{workflow_tag_id} = 
$ATP; + + if ($curation_tag) { + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{curation_tag} = $curation_tag; + } + + + if ($note) { + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = $note; + } + + + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records; + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} = $relevant_record_type; + + if ($debugging_note) { + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; + } + + } + + } + } + + # once been through all the curation record types for each workflow_type, see if there is additional information that can be added via nocur flag + if (exists $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}) { + + unless (exists $workflow_status_data->{$pub_id}->{$workflow_type}) { + + if ($nocur_status == 1) { + + my $curator = 'FB_curator'; # default that is overriden with more specific data later where possible + my $timestamp = "$nocur_timestamp"; + my $curation_records = 'WARNING: unable to get curator details for nocur flag'; # default that is overriden with more specific data later where possible + my $ATP = $workflow_tag_mapping->{$workflow_type}->{'nocur_override'}; + my $curation_tag = "ATP:0000207"; # no genetic data + my $note = "$nocur_note"; + my $debugging_note = ''; + + # get curator details for the nocur flag and use to override defaults where possible + my $nocur_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id,$timestamp); + + if (defined $nocur_details) { + + $curator = "$nocur_details->{curator}"; + $curation_records = "$nocur_details->{currecs}"; + + } + + # override edge cases where a curator added nocur in a user record + if ($curator eq 'Author Submission' || $curator eq 'User Submission') { + $curator = 'FB_curator'; + + } + + + # build reference with information for this 
publication+workflow type combination + + my $FBrf_with_prefix="FB:".$FBrf; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_updated} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{updated_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{mod_abbreviation} = "FB"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{reference_curie} = $FBrf_with_prefix; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{workflow_tag_id} = $ATP; + + # + + if ($curation_tag) { + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{curation_tag} = $curation_tag; + } + + + if ($note) { + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = $note; + } + + + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = $curation_records; + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} = 'via flag'; + + if ($debugging_note) { + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; + + } + } + } + + } + + # next, see if there are any non-curated papers that should be marked as 'need curation' as they are high-priority + if (exists $workflow_tag_mapping->{$workflow_type}->{'high_priority_override'}) { + + foreach my $flag_type (sort keys %{$high_priority_flags}) { + + unless (exists $workflow_status_data->{$pub_id}->{$workflow_type}) { + + if (exists $high_priority_flags->{$flag_type}->{$pub_id}) { + + my $ATP = $workflow_tag_mapping->{$workflow_type}->{'high_priority_override'}; + my $curation_tag = "ATP:0000353"; # high priority data + + my $note = "$high_priority_mapping->{$flag_type}"; + my $debugging_note = ''; + + + my $timestamp = 
$high_priority_flags->{$flag_type}->{$pub_id}->{"$high_priority_mapping->{$flag_type}"}[0]; + # set generic defaults that are overwritten later with more specific information + my $curator = 'FB_curator'; + my $curation_records = ''; + + + # get curator details for the high priority flag and use to override generic defaults where possible + my $high_priority_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $timestamp); + + if (defined $high_priority_details) { + + $curator = "$high_priority_details->{curator}"; + $curation_records = "$high_priority_details->{currecs}"; + + } + + + # build reference with information for this publication+workflow type combination + + my $FBrf_with_prefix="FB:".$FBrf; + + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_updated} = $timestamp; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{updated_by} = $curator; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{mod_abbreviation} = "FB"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{reference_curie} = $FBrf_with_prefix; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{workflow_tag_id} = $ATP; + + if ($curation_tag) { + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{curation_tag} = $curation_tag; + } + + + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) { + + my $existing_note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = "$existing_note, $note"; + + + } else { + + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = "$note"; + + } + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} = 
$curation_records; + + if ($debugging_note) { + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = $debugging_note; + } + } + } + + } + + } + + } + + +} + +# try to add relevant publication-level internal notes where appropriate + +foreach my $pub_id (sort keys %{$all_candidate_internal_notes}) { + + if (exists $workflow_status_data->{$pub_id}) { + + foreach my $int_note (sort keys %{$all_candidate_internal_notes->{$pub_id}}) { + + my $switch = 0; + my $reformatted_note = &clean_note("$int_note"); + $reformatted_note =~ s/\n/ /g; + + # for internal notes with a single timestamp + if (scalar @{$all_candidate_internal_notes->{$pub_id}->{$int_note}} == 1) { + + + my $int_note_timestamp = join '', @{$all_candidate_internal_notes->{$pub_id}->{$int_note}}; + + # A. first pass - go through different workflow types, from least to most detailed curation, and match up internal notes using timestamp and curation record info + # 1. go through the different workflow_tags to find cases where both the timestamp and curation records show a match + foreach my $workflow_type (sort keys %{$workflow_status_data->{$pub_id}}) { + + unless ($switch) { + my $workflow_type_timestamp = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created}"; + my $workflow_type_currecs = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs}"; + my $workflow_type_curator = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by}"; + + + my $int_note_curator_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $int_note_timestamp); + + # 2. if the timestamps of the workflow tag and internal note match + if ($int_note_timestamp eq $workflow_type_timestamp) { + + # 3. 
get the curation record details for the internal note to check against those of workflow_tag + if (defined $int_note_curator_details) { + + my $int_note_currecs = "$int_note_curator_details->{currecs}"; + my $int_note_curator = "$int_note_curator_details->{curator}"; + + + # 4. if the curation record for the workflow type is in the list of possibilities for the internal note + # and there is only one curator possibility for that timestamp + # then the internal note can be added to the entry for that workflow tag + if ($int_note_currecs =~ m/$workflow_type_currecs/ && $int_note_currecs ne 'multiple curators for same timestamp') { + + $switch++; + + + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) { + + + my $note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$note||$int_note"); + + + } else { + + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$int_note"); + + } + } + + } + + } else { + + # if the timestamps do not match, but the curation record filename matches the type, its still OK to add the note + foreach my $relevant_record_type (@{$workflow_tag_mapping->{$workflow_type}->{'relevant_record_type'}}) { + + unless ($switch) { + + if (defined $int_note_curator_details) { + if ($int_note_curator_details->{currecs} =~ m/$relevant_record_type/) { + + $switch++; + + + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) { + + + my $existing_note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}"; + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$existing_note||$int_note"); + + + } else { + + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$int_note"); + } + } + + } + + } + } + + } + + } + } + + + # B. 
second pass, if the internal note did not match in the loop above, add it to the note for the workflow_type if appropriate, so that the information gets into the Alliance + # (Many of the non-matching notes are edits to original curation, so makes sense to add to either skim/manual_indexing workflow types). + foreach my $workflow_type (reverse sort keys %{$workflow_tag_mapping}) { + + if (exists $workflow_tag_mapping->{$workflow_type}->{'second_pass'}) { + + unless ($switch) { + + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}) { + + my $workflow_type_timestamp = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{date_created}"; + my $workflow_type_currecs = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs}"; + my $workflow_type_curator = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{created_by}"; + + my $int_note_curator_details = &get_relevant_curator_from_candidate_list_using_pub_and_timestamp($all_curation_record_data, $pub_id, $int_note_timestamp); + + if (defined $int_note_curator_details) { + + my $int_note_currecs = "$int_note_curator_details->{currecs}"; + my $int_note_curator = "$int_note_curator_details->{curator}"; + + my $debugging_note = "ADDED in second pass: $int_note ($int_note_curator, $int_note_currecs)"; + $debugging_note =~ s/\n/ /g; + + $switch++; + + + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}) { + + + my $note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note}"; + + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$note||$int_note"); + + + } else { + + $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{note} = &clean_note("$int_note"); + + } + + if (exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note}) { + + + my $existing_note = "$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note}"; + 
$workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = &clean_note("$existing_note||$debugging_note"); + + + } else { + + $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} = &clean_note("$debugging_note"); + + } + } + + } + + } + } + } + + + } else { + + print $data_error_file "WARNING: internal note(s) with MULTIPLE TIMESTAMPS: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n"; + + + } + + unless ($switch) { + + print $data_error_file "WARNING: internal note(s) that could not match up: $pub_id\t$pub_id_to_FBrf->{$pub_id}->{'FBrf'}\t$pub_id_to_FBrf->{$pub_id}->{'type'}\t$reformatted_note\n"; + + + } + } + } +} + + +#print Dumper ($workflow_status_data); + +my $complete_data = {}; + + +foreach my $pub_id (sort keys %{$workflow_status_data}) { + + foreach my $workflow_type (sort keys %{$workflow_status_data->{$pub_id}}) { + + my $FBrf = $pub_id_to_FBrf->{$pub_id}->{'FBrf'}; + my $pub_type = $pub_id_to_FBrf->{$pub_id}->{'type'}; + + + + #store data for making json later + push @{$complete_data->{data}}, $workflow_status_data->{$pub_id}->{$workflow_type}->{json}; + + # simple output for testing/debugging + my $curated_by = $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'created_by'} ? "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'created_by'}" : ''; + my $updated_by = $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'updated_by'} ? "$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'updated_by'}" : ''; + + + my $workflow_tag_id = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'workflow_tag_id'} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'workflow_tag_id'} : ''; + my $date_created = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'date_created'} ? 
$workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'date_created'} : ''; + + + my $curation_tag = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'curation_tag'} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'curation_tag'} : ''; + my $note = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'note'} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{json}->{'note'} : ''; + $note =~ s/\n/ /g; + + my $curation_records = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{currecs} : ''; + my $debugging_note = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{note} : ''; + my $relevant_record_type = exists $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} ? $workflow_status_data->{$pub_id}->{$workflow_type}->{debugging}->{relevant_record_type} : ''; + + + print $plain_output_file "DATA:$pub_id\t$FBrf\t$pub_type\t$relevant_record_type\t$curated_by\t$curation_records\t$workflow_tag_id\t$curation_tag\t$date_created\t$note\t$debugging_note\n"; + } +} + +#print Dumper ($complete_data); + + +unless ($ENV_STATE eq "test") { + + my $json_metadata = &make_abc_json_metadata($db, $api_endpoint); + + $complete_data->{"metaData"} = $json_metadata; + my $complete_json_data = $json_encoder->encode($complete_data); + + print $json_output_file $complete_json_data; + + +} else { + + + foreach my $element (@{$complete_data->{"data"}}) { + + my $json_element = $json_encoder->encode($element); + + + my $cmd="curl -X 'POST' 'https://stage-literature-rest.alliancegenome.org/$api_endpoint/' -H 'accept: application/json' -H 'Authorization: Bearer $access_token' -H 'Content-Type: application/json' -d '$json_element'"; + my $raw_result = `$cmd`; + + + + if ($raw_result =~ m/^\d+$/) { + + print 
$plain_output_file "json post success\nJSON:\n$json_element\n\n"; + + } else { + + print $process_error_file "json post failed\nJSON:\n$json_element\nREASON:\n$raw_result\n#################################\n\n"; + + } + + + } + + + + +} + + + +close $json_output_file; +close $data_error_file; +close $process_error_file; +close $plain_output_file; + +print STDERR "##Ended processing: " . (scalar localtime) . "\n"; + +