-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathuid2TM.py
More file actions
151 lines (123 loc) · 5.59 KB
/
uid2TM.py
File metadata and controls
151 lines (123 loc) · 5.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import pandas as pd
import requests
import subprocess
import numpy as np
from Bio.PDB import PDBParser
from tqdm import tqdm #add progress bar
# Ask user for input CSV file path and working directory
csv_path = input("Enter the path to the CSV file (containing prefix, uid1, uid2): ")
work_dir = input("Enter the path to the working directory where results will be saved: ")
# Load CSV input
df = pd.read_csv(csv_path)
df['Rg1'] = np.nan
df['Rg2'] = np.nan
df['RMSD'] = np.nan
df['TM-Score1'] = np.nan
df['TM-Score2'] = np.nan
df['Aligned Length'] = np.nan
df['Seq_ID'] = np.nan
# Create working directory if it doesn't exist
if not os.path.exists(work_dir):
os.makedirs(work_dir)
# AlphaFold base URL for downloading PDB files
alphafold_url = "https://alphafold.ebi.ac.uk/files/AF-{}-F1-model_v4.pdb"
# Function to fetch protein structure
def fetch_protein_structure(uniprot_id, output_path):
url = alphafold_url.format(uniprot_id)
response = requests.get(url)
if response.status_code == 200:
with open(output_path, 'wb') as f:
f.write(response.content)
else:
raise ValueError(f"Error fetching PDB for {uniprot_id}, status code: {response.status_code}")
# Function to calculate radius of gyration using Bio.PDB
def calculate_radius_of_gyration(pdb_path):
parser = PDBParser(QUIET=True)
structure = parser.get_structure('protein', pdb_path)
# Extract all atom coordinates
atom_coords = np.array([atom.coord for atom in structure.get_atoms()])
if len(atom_coords) == 0:
raise ValueError(f"No atoms found in structure {pdb_path}")
# Calculate center of mass
center_of_mass = np.mean(atom_coords, axis=0)
# Calculate radius of gyration
rg = np.sqrt(np.mean(np.sum((atom_coords - center_of_mass)**2, axis=1)))
return rg
# Function to align structures using TM-align and save output to TMalign.out
def align_with_tmalign(pdb1_path, pdb2_path, output_dir):
# Path to TMalign.out file
tmalign_output_path = os.path.join(output_dir, "TMalign.out")
# Run TM-align with output redirected to TMalign.out
cmd = f"$TMalign {pdb1_path} {pdb2_path} > {tmalign_output_path}"
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
if result.returncode != 0:
raise Exception(f"TM-align failed: {result.stderr}")
return tmalign_output_path
# Function to parse TM-align output file
def parse_tmalign_output(tmalign_output_path):
aligned_length = rmsd = seq_id = tm_score1 = tm_score2 = None
with open(tmalign_output_path, 'r') as file:
for line in file:
if line.startswith("Aligned length="):
parts = line.split(',')
# Extract values after the last '=' for each part
aligned_length = int(parts[0].split('=')[-1].strip())
rmsd = float(parts[1].split('=')[-1].strip())
seq_id = float(parts[2].split('=')[-1].strip())
elif line.startswith("TM-score="):
# Extract only the first numeric value after '='
score_value = line.split('=')[1].split()[0] # First part after '=' before any space
if "normalized by length of Chain_1" in line:
tm_score1 = float(score_value)
elif "normalized by length of Chain_2" in line:
tm_score2 = float(score_value)
# Ensure all values are found, else raise an exception
if None in (aligned_length, rmsd, seq_id, tm_score1, tm_score2):
raise Exception("Failed to find relevant information in TM-align output.")
return aligned_length, rmsd, seq_id, tm_score1, tm_score2
# Iterate through each row in the DataFrame
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing protein pairs"):
#for index, row in df.iterrows():
prefix = row['prefix']
uid1 = row['uid1']
uid2 = row['uid2']
# Create a directory for the current protein pair
pair_dir = os.path.join(work_dir, prefix)
os.makedirs(pair_dir, exist_ok=True)
# Paths for PDB files
pdb1_path = os.path.join(pair_dir, f"{uid1}.pdb")
pdb2_path = os.path.join(pair_dir, f"{uid2}.pdb")
# Fetch the protein structures from AlphaFold
try:
fetch_protein_structure(uid1, pdb1_path)
fetch_protein_structure(uid2, pdb2_path)
except ValueError as e:
print(e)
continue
# Calculate radius of gyration for both proteins
try:
Rg1 = calculate_radius_of_gyration(pdb1_path)
Rg2 = calculate_radius_of_gyration(pdb2_path)
df.at[index, 'Rg1'] = Rg1
df.at[index, 'Rg2'] = Rg2
except Exception as e:
print(f"Error calculating radius of gyration for {uid1} or {uid2}: {e}")
continue
# Align protein structures and calculate TM-score and RMSD
try:
tmalign_output_file = align_with_tmalign(pdb1_path, pdb2_path, pair_dir)
aligned_length, rmsd, seq_id, tm_score1, tm_score2 = parse_tmalign_output(tmalign_output_file)
# Update DataFrame with parsed values
df.at[index, 'RMSD'] = rmsd
df.at[index, 'TM-Score1'] = tm_score1
df.at[index, 'TM-Score2'] = tm_score2
df.at[index, 'Aligned Length'] = aligned_length
df.at[index, 'Seq_ID'] = seq_id
except Exception as e:
print(f"Error aligning {uid1} and {uid2}: {e}")
continue
# Save the updated DataFrame to a new CSV file with "_TMscore" appended to the original filename
output_csv_path = os.path.join(work_dir, os.path.basename(csv_path).replace('.csv', '_TMscore.csv'))
df.to_csv(output_csv_path, index=False)
print(f"Results saved to {output_csv_path}")