sequence_alignment/evaluate_simple_score.py at main · leonokida/sequence_alignment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import glob
import json
import numpy as np
from Bio import SeqIO

def calculate_simple_score_from_strings(seqs):
    """
    Calculate the Sum-of-Pairs score of a multiple sequence alignment.

    Every unordered pair of sequences is compared column by column:

    Match (same non-gap character): +1
    Mismatch (different characters, including gap vs. residue): -1
    Gap-Gap: 0

    Comparison is case-sensitive and '-' is the only gap character.

    Args:
        seqs: Aligned sequences (strings or other sized iterables of single
            characters). All sequences must have the same length.

    Returns:
        The total score as a plain Python int. Fewer than two sequences, or
        zero-length sequences, score 0.

    Raises:
        ValueError: If the sequences do not all have the same length.
    """
    seqs = list(seqs)
    if len(seqs) < 2:
        return 0

    width = len(seqs[0])
    for idx, seq in enumerate(seqs):
        # Without this check a length-1 sequence would silently broadcast
        # against longer ones and produce a meaningless score.
        if len(seq) != width:
            raise ValueError(
                f"Sequence {idx} has length {len(seq)}, expected {width}; "
                "all aligned sequences must have equal length."
            )

    if width == 0:
        return 0

    # Convert each sequence (and its gap mask) once, not once per pair.
    columns = [np.array(list(seq)) for seq in seqs]
    gap_masks = [col == '-' for col in columns]

    score = 0
    for i in range(len(columns) - 1):
        for j in range(i + 1, len(columns)):
            either_gap = gap_masks[i] | gap_masks[j]
            both_gap = gap_masks[i] & gap_masks[j]

            # Matches are equal characters where neither side is a gap.
            n_matches = int(np.count_nonzero((columns[i] == columns[j]) & ~either_gap))
            n_gap_gap = int(np.count_nonzero(both_gap))
            # Every column that is neither a match nor gap-gap is a mismatch.
            n_mismatches = width - n_matches - n_gap_gap

            score += n_matches - n_mismatches

    return int(score)

def _experiment_type(file_path):
    """Return the path part just above the first ``run_*`` part, or None if absent."""
    parts = file_path.split(os.sep)
    for idx, part in enumerate(parts):
        if part.startswith('run_'):
            # idx == 0 has no parent part; parts[-1] would wrongly be the file name.
            return parts[idx - 1] if idx > 0 else None
    return None


def main(base_dir="experiments_full"):
    """
    Score every ``best_alignment.fasta`` under *base_dir* and print a summary.

    Files are found recursively. Each file is attributed to an experiment type,
    taken as the path component directly above the first component starting
    with ``run_``; files without such an anchor are skipped. Sequences shorter
    than the longest one in a file are right-padded with '-' before scoring.
    Scores are grouped by experiment type and the mean, standard deviation
    (``np.std`` with its default settings), min, max and count are printed.

    Args:
        base_dir: Root directory to search. Defaults to "experiments_full".
    """
    results = {}

    # Find all best_alignment.fasta files; sort so output order is reproducible.
    search_pattern = os.path.join(base_dir, "**", "best_alignment.fasta")
    files = sorted(glob.glob(search_pattern, recursive=True))

    print(f"Found {len(files)} alignment files.")

    for file_path in files:
        exp_type = _experiment_type(file_path)
        if exp_type is None:
            continue

        # Best-effort batch: one unreadable file must not abort the whole run.
        try:
            records = list(SeqIO.parse(file_path, "fasta"))
            if not records:
                continue

            # Normalize lengths (pad with gaps if necessary)
            max_len = max(len(r.seq) for r in records)
            seqs = [str(r.seq).ljust(max_len, "-") for r in records]

            score = calculate_simple_score_from_strings(seqs)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            continue

        results.setdefault(exp_type, []).append(score)

    # Print Summary
    print("\n--- Simple Score Evaluation (+1/-1) ---\n")
    print(f"{'Experiment':<25} | {'Mean Score':<15} | {'Std Dev':<15} | {'Min':<10} | {'Max':<10} | {'Count':<5}")
    print("-" * 90)

    for exp_type, scores in sorted(results.items()):
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        min_score = np.min(scores)
        max_score = np.max(scores)
        count = len(scores)

        print(f"{exp_type:<25} | {mean_score:<15.2f} | {std_score:<15.2f} | {min_score:<10} | {max_score:<10} | {count:<5}")

# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()