Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
5a48a82
basic skeleton for populate_workflow_status.pl script
gm119 Jan 20, 2026
6720f78
tightening up regex for cam_full so will work for both phenotype and …
gm119 Jan 20, 2026
9f07754
adding get relevant data for working out status
gm119 Jan 20, 2026
f344a81
get publications that have genetic data (will be used for validation)
gm119 Jan 20, 2026
bca59bb
get list of publications to get status for
gm119 Jan 20, 2026
b87af37
first attempt at mapping hash
gm119 Jan 20, 2026
f7556e7
curation status data now in single reference, adding missing mapping
gm119 Jan 20, 2026
ae9ca72
First outline of script: adds workflow_tag information for different …
gm119 Jan 21, 2026
eedf805
adding basic nocur logic
gm119 Jan 21, 2026
a0d05c0
add doc for get_relevant_curator_from_candidate_list subroutine, cope…
gm119 Jan 21, 2026
5574912
documentation for check_and_validate_nocur
gm119 Jan 21, 2026
32d4c22
adding additional information that can be added via nocur flag (when …
gm119 Jan 21, 2026
daa3cac
changes to get_relevant_curator_from_candidate_list: simpler input, p…
gm119 Jan 22, 2026
e263e53
faster way to identify matching curator for a pub_id+timestamp combo:…
gm119 Jan 23, 2026
a3833b8
adding nocur details to output
gm119 Jan 23, 2026
31192dd
start putting debugging output in separate list for printing
gm119 Jan 23, 2026
e5e4a31
simplifying nocur checking and separating out debugging comments
gm119 Jan 23, 2026
3f13beb
moving check_and_validate_nocur to Util.pm'
gm119 Jan 23, 2026
52a5cc0
moving get_relevant_curator_from_candidate_list to Util.pm and tidyin…
gm119 Jan 23, 2026
dc27041
deal with nocur edge case
gm119 Jan 23, 2026
0f0dc98
mark high-priority papers as 'need curation' if not already curated i…
gm119 Jan 23, 2026
77e0c1c
adding harv_flag diseaseHP flags to high-priority 'needs curation' check
gm119 Jan 23, 2026
211cf00
first attempt at adding relevant internal note to note slto
gm119 Jan 23, 2026
af1bb96
change structure of reference hash to make it easier to add relevant …
gm119 Jan 26, 2026
5c0eead
first commit of get_all_pub_internal_notes_for_tet_wf
gm119 Jan 26, 2026
efd4c2f
more changes to get_all_pub_internal_notes_for_tet_wf
gm119 Jan 27, 2026
760ac50
more doc for get_all_pub_internal_notes_for_tet_wf
gm119 Jan 27, 2026
b29743b
first attempt at adding internal notes to appropriate workflow_tag el…
gm119 Jan 28, 2026
931ad21
tightening up regex for authors
gm119 Jan 30, 2026
5076882
another email regex tweak
gm119 Jan 30, 2026
435efc2
adding filter for cell_line note
gm119 Jan 30, 2026
8e85b93
adding clean_note to clean note text formatting
gm119 Jan 30, 2026
1d99ef6
another filter
gm119 Feb 1, 2026
d3efa13
another filter
gm119 Feb 1, 2026
5d5abf7
adding loop to try to add internal notes that do not match by timestamp
gm119 Feb 2, 2026
e155f1a
adding another filter
gm119 Feb 2, 2026
9b4001f
adding another filter
gm119 Feb 2, 2026
047fe46
adding second pass to get more internal notes in
gm119 Feb 2, 2026
b1b2d62
tidying
gm119 Feb 2, 2026
8412299
adding printing to output files, printing json, first attempt at test…
gm119 Feb 2, 2026
2b124db
doc tweak
gm119 Feb 3, 2026
ea35611
doc outlining script logic
gm119 Feb 3, 2026
ca0c217
adding wt_exp needs expression to high priority
gm119 Feb 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 215 additions & 3 deletions lib/AuditTable.pm
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use warnings;

require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(get_relevant_curator get_flag_info_with_audit_data get_timestamps_for_flag_with_suffix get_timestamps_for_flaglist_with_suffix get_timestamps_for_pubprop_value get_relevant_currec_for_datatype get_matching_pubprop_value_with_timestamps get_all_flag_info_with_timestamps);
our @EXPORT = qw(get_relevant_curator get_all_currec_data get_flag_info_with_audit_data get_timestamps_for_flag_with_suffix get_timestamps_for_flaglist_with_suffix get_timestamps_for_pubprop_value get_relevant_currec_for_datatype get_matching_pubprop_value_with_timestamps get_all_flag_info_with_timestamps get_all_pub_internal_notes_for_tet_wf);


=head1 MODULE: AuditTable
Expand Down Expand Up @@ -137,6 +137,69 @@ sub get_relevant_curator {
return ($data);
}

sub get_all_currec_data {



=head1 SUBROUTINE:
=cut

=head1

Title: get_all_currec_data
Usage: get_all_currec_data(database_handle);
Function: Stores curator and corresponding curation_record_filename information for all 'curated_by' pubprops, keyed on pub_id.
Example: my $all_curation_record_data = &get_all_currec_data($dbh);

Arguments: database handle

Returns: data structure with the following format:

$data->{$pub_id}->{$timestamp}->{$curator}->{$curation_record_filename}


The get_relevant_curator_from_candidate_list_using_pub_and_timestamp in lib/Util.pm can be used to filter the returned list for the curator (and corresponding curation_record_filename information) that matches a specific pub_id plus timestamp combination.


=cut


unless (@_ == 1) {

die "Wrong number of parameters passed to the get_all_currec_data subroutine\n";
}

my ($dbh) = @_;

my $data = {};

my $sql_query=sprintf("select distinct pp.pub_id, pp.value, ac.transaction_timestamp from pubprop pp, cvterm c, audit_chado ac where ac.audited_table='pubprop' and ac.audit_transaction='I' and pp.pubprop_id=ac.record_pkey and c.cvterm_id=pp.type_id and c.name = 'curated_by' order by ac.transaction_timestamp");

my $db_query = $dbh->prepare($sql_query);
$db_query->execute or die "WARNING: ERROR: Unable to execute get_all_currec_data query ($!)\n";


while (my ($pub_id, $curated_by_value, $timestamp) = $db_query->fetchrow_array) {


if ($curated_by_value =~ m/^Curator: (.+?);Proforma: (.+?);timelastmodified: (.*)$/) {

my $curator = $1;
my $curation_record_filename = $2;
my $timelastmodified = $3;

$data->{$pub_id}->{$timestamp}->{$curator}->{$curation_record_filename}++;

} else {
# not expecting to trip this error
print "ERROR: wrong curated_by pubprop format for pub_id: $pub_id, pubprop: $curated_by_value\n";
}
}

return $data;
}



sub get_flag_info_with_audit_data {

Expand Down Expand Up @@ -470,7 +533,7 @@ sub get_relevant_currec_for_datatype {

Title: get_relevant_currec_for_datatype
Usage: get_relevant_currec_for_datatype(database_handle,datatype);
Function: Gets timestamp information for curation records that are expected to contain data of a particular type based on the their filename, for the datatype specified in the second argument.
Function: Gets timestamp information for curation records that are expected to contain data/curation of a particular type based on the their filename, for the data/curation type specified in the second argument.
Example: my $currec_for_cell_line = &get_relevant_currec_for_datatype($dbh,'cell_line');

Arguments:
Expand All @@ -488,6 +551,7 @@ Arguments:
# only use standard filenames for matching so that can be sure that curation of the datatype is complete
my $datatype_mapping = {

# filename types used for topic curation status
'cell_line' => '.+?\.(cell|cell_multiple|cell_multi)\..+?',
'phys_int' => '.+?\.(int|int_miRNA)\..+?',
'DO' => '.+?\.DO\..+?',
Expand All @@ -496,8 +560,15 @@ Arguments:
'chemical' => '.+?\.chem\..+?',
'args' => '.+?\.args\..+?',
'phen' => '[a-z][a-z][0-9]{1,}\.phen',
'cam_full' => '(ma|sb|rd|sf|tj|al|sm|cm|pm|gm|ao|cp|lp|sp|sr|rs|ra|ds|ew|cy)[0-9]{1,}(\.(h|hf))?',
'humanhealth' => '.+?\.(hh|hh_multiple|hds|hds_multiple|hh_[0-9]{1,}|hds_rvw)\..+?',
# filename types used for workflow status
'thin' => '[a-z][a-z][0-9]{1,}\.thin',
'user' => '[a-z][a-z][0-9]{1,}\.user',
'skim' => '.+?\.(skim|skim_multiple|skim_multi)(\..+)?',
'gene_full' => '[a-z][a-z][0-9]{1,}\.full', # publications that have had full manual indexing (listing) of genes, and genes are the only entity type relevant to manual indexing in the publication
'cam_no_suffix' => '(ma|sb|rd|sf|tj|kk|al|sm|cm|pm|gm|ao|cp|lp|sp|sr|rs|ra|ds|vt|ew|cy|pu|ga|ha|hb|rc|rf|pg|st|rz)[0-9]{1,}',
# filename types used for both topic curation status AND workflow status
'cam_full' => '(ma|sb|rd|al|sm|cm|pm|gm|lp|sr|rs|ra|ds|ew|cy)[0-9]{1,}(\.(h|hf))?', # publications that should have had full curation of both any genetic reagent data AND any phenotype data



Expand Down Expand Up @@ -719,3 +790,144 @@ o $audit_timestamp is a timestamp from the audit_chado table. The $audit_timesta
return $data;

}





sub get_all_pub_internal_notes_for_tet_wf {


=head1 SUBROUTINE:
=cut

=head1

Title: get_all_pub_internal_notes_for_tet_wf
Usage: get_all_pub_internal_notes_for_tet_wf(database_handle);
Function: Gets all publication level internal notes that we want to try to submit to the Alliance ABC in either the topic-entity-tg (tet) editor or the workflow editor, additionally filtered using the regexes in the array reference provided in the second argument.
Example: my $all_pub_internal_notes_for_tet_wf = &get_all_pub_internal_notes_for_tet_wf($dbh, $additional_filters);

Arguments:

o database handle

o $additional_filters - reference to an array of additional regular expressions to be used to filter the internal notes before they are returned. (This array reference can be empty).

Returns:

The returned hash reference has the following structure:

@{$data->{$pub_id}->{$line}}, $audit_timestamp;


o $pub_id is the pub_id of the reference.

o $line is the complete internal note; if the internal note contains multiple lines they will be present in the returned $line, so this may need to be converted to spaces when submitting to the ABC if you want the note to appear as a single line.

o $audit_timestamp is the 'I' timestamp(s) from the audit_chado table for the matching $line. The $audit_timestamp values are sorted earliest to latest within the array.


The subroutine only returns internal notes that we want to end up in either the topic-entity-tg (tet) editor or the workflow editor in the Alliance ABC. Internal notes of two kinds are thus removed from returned output by default, using the regular expressions in the following array references:


o $ignore - internal notes that are related to the bibliographic data and are no longer relevant.

o $email - email address information (either community curation or the submitter of a personal communication) - these will be submitted elsewhere in the ABC.


In addition, any regular expressions passed via the $additional_filters array reference are also used to filter the returned output. Note that the regexes are used to remove matching text from each internal note; any remaining text after the substitutions is returned. This strategy is necessary as in some cases, a single internal note contains multiple lines that may need different treatment when being exported to the ABC. For example, some internal notes in FB are not being submitted as free text 'notes' to the ABC, but are instead being turned into an ATP term representing a topic, topic status or a controlled tag; in these cases, the regexes in $additional_filters can be used to remove the relevant internal notes so that they are not then redundantly also submitted to the ABC as a free text note.

This means that any regex in $additional_filters should typically be specified to either remove a *complete* line (using ^ and $ in the regex), or be sufficiently specific that it is safe to remove the text when it is part of a line (e.g. similar to the email regexes in the $email array reference below).

(It isn't possible to just split on newline to separate out each 'row' of a given 'multiple lines' internal note, because in some cases, the multiple lines correspond to a paragraph (i.e. the multiple lines should stay together when submitted to the ABC) but in others, each row represents a different kind of internal note that may need to be split up and treated differently as they are submitted to the ABC (e.g. those that are being converted into an ATP term as described above).



=cut

unless (@_ == 2) {

die "Wrong number of parameters passed to the get_all_pub_internal_notes_for_tet_wf subroutine\n";
}


my ($dbh, $additional_filters) = @_;

my $data = {};


my $ignore = [

'Automated trawl for PMID, ggrumbli 090330',
'BIOSIS Gene Name field: .+',
'^[?:]?Supplement:.+$', # there are some cases with ? or : at the beginning of the Supplement: line
];


my $email = [

'(Author|User|Submitter|submitter): ?.*?\<?\S+@\S+?\>?$',
'(Author|User|Submitter|submitter): ?.*?\<?\S+@\S+?\>? [a-z]{2}[0-9]{1,}\.$',
'(Author|User|Submitter|submitter): ?.*?\<?\S+_at_\S+?\>?$',
'(Author|User|Submitter|submitter): ?.*?\<?\S+_AT_\S+?\>?$',
'(Author|User|Submitter|submitter) .*?\<?\S+@\S+?\>?\. ?[a-z]{2}[0-9]{1,}\.$',

# remove email info from beginning of line - require <> around email part of regex else is not strict enough
'(Author|User|Submitter|submitter): .*?\<\S+?@\S+?\>\. ',

'^(Author|User|Submitter|submitter):$',



];


my $sql_query = sprintf("select distinct pp.pub_id, pp.value, ac.transaction_timestamp from pubprop pp, cvterm c, audit_chado ac where c.cvterm_id=pp.type_id and c.name ='internalnotes' and ac.audited_table='pubprop' and ac.audit_transaction = 'I' and pp.pubprop_id=ac.record_pkey order by ac.transaction_timestamp");
my $db_query = $dbh->prepare($sql_query);
$db_query->execute or die "WARNING: ERROR: Unable to execute get_timestamps_for_pubprop_value query ($!)\n";


while (my ($pub_id, $value, $audit_timestamp) = $db_query->fetchrow_array) {


foreach my $regex (@{$ignore}) {

$value =~ s/$regex//mg;


}

foreach my $regex (@{$email}) {

$value =~ s/$regex//mg;


}

foreach my $regex (@{$additional_filters}) {

$value =~ s/$regex//mg;


}


$value =~ s/^ +//;
$value =~ s/ +$//;
$value =~ s/^\n+//;
$value =~ s/\n+$//;

$value =~ s/\t//g;

if ($value ne '') {

push @{$data->{$pub_id}->{$value}}, $audit_timestamp;

}
}

return $data;

}

Loading