Skip to content

Commit bf67fbb

Browse files
committed
3 #71
1 parent 1aece93 commit bf67fbb

2 files changed

Lines changed: 839 additions & 562 deletions

File tree

Lines changed: 354 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,354 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "b5f1facc",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"# Import necessary libraries\n",
11+
"import pandas as pd\n",
12+
"import numpy as np\n",
13+
"import matplotlib.pyplot as plt\n",
14+
"import seaborn as sns\n",
15+
"from collections import Counter\n",
16+
"\n",
17+
"# Set display options\n",
18+
"pd.set_option('display.max_columns', None)\n",
19+
"pd.set_option('display.max_rows', 100)\n",
20+
"pd.set_option('display.width', None)\n",
21+
"\n",
22+
"# Set plot style\n",
23+
"plt.style.use('seaborn-v0_8-whitegrid')\n",
24+
"sns.set_palette('husl')"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": null,
30+
"id": "5452d8e5",
31+
"metadata": {},
32+
"outputs": [],
33+
"source": [
34+
"# Create VirtualDB configuration\n",
35+
"# This configuration defines how to map the fields of different datasets and how to associate DTO comparative analysis data\n",
36+
"\n",
37+
"import tempfile\n",
38+
"from pathlib import Path\n",
39+
"\n",
40+
"config_yaml = \"\"\"\n",
41+
"repositories:\n",
42+
" BrentLab/harbison_2004:\n",
43+
" dataset:\n",
44+
" harbison_2004:\n",
45+
" sample_id:\n",
46+
" field: sample_id\n",
47+
" carbon_source:\n",
48+
" field: condition\n",
49+
" path: media.carbon_source.compound\n",
50+
" temperature_celsius:\n",
51+
" field: condition\n",
52+
" path: temperature_celsius\n",
53+
" dtype: numeric\n",
54+
" environmental_condition:\n",
55+
" field: condition\n",
56+
" regualtor_locus_tag:\n",
57+
" field: regulator_locus_tag\n",
58+
" regulator_symbol:\n",
59+
" field: regulator_symbol\n",
60+
"\n",
61+
" comparative_analyses:\n",
62+
" - repo: BrentLab/yeast_comparative_analysis\n",
63+
" dataset: dto\n",
64+
" via_field: binding_id\n",
65+
"\n",
66+
" BrentLab/rossi_2021:\n",
67+
" carbon_source: \n",
68+
" path: media.carbon_source.compound\n",
69+
" temperature_celsius: \n",
70+
" path: temperature_celsius\n",
71+
" dataset:\n",
72+
" rossi_2021_af_combined:\n",
73+
" sample_id: \n",
74+
" field: sample_id\n",
75+
" regulator_locus_tag:\n",
76+
" field: regulator_locus_tag\n",
77+
" target_locus_tag:\n",
78+
" field: target_locus_tag\n",
79+
"\n",
80+
" comparative_analyses:\n",
81+
" - repo: BrentLab/yeast_comparative_analysis\n",
82+
" dataset: dto\n",
83+
" via_field: binding_id\n",
84+
"\n",
85+
" BrentLab/mahendrawada_2025:\n",
86+
" dataset:\n",
87+
" reprocessed_diffcontrol_5prime:\n",
88+
" sample_id:\n",
89+
" field: sample_id\n",
90+
" control_source:\n",
91+
" field: control_source\n",
92+
" regulator_locus_tag:\n",
93+
" field: regulator_locus_tag\n",
94+
" regulator_symbol:\n",
95+
" field: regulator_symbol\n",
96+
" environmental_condition:\n",
97+
" field: condition\n",
98+
" temperature_celsius:\n",
99+
" field: condition\n",
100+
" path: temperature_celsius\n",
101+
" dtype: numeric\n",
102+
" media_name:\n",
103+
" field: condition\n",
104+
" path: media.name\n",
105+
" carbon_source:\n",
106+
" field: condition\n",
107+
" path: media.carbon_source\n",
108+
"\n",
109+
" comparative_analyses:\n",
110+
" - repo: BrentLab/yeast_comparative_analysis\n",
111+
" dataset: dto\n",
112+
" via_field: binding_id\n",
113+
"\n",
114+
"\n",
115+
" BrentLab/callingcards:\n",
116+
" carbon_source: \n",
117+
" path: media.carbon_source.compound\n",
118+
" temperature_celsius: \n",
119+
" path: temperature_celsius\n",
120+
" dataset:\n",
121+
" annotated_features:\n",
122+
" id:\n",
123+
" field: id\n",
124+
" regulator_locus_tag:\n",
125+
" field: target_locus_tag\n",
126+
" regulator_symbol:\n",
127+
" field: target_symbol\n",
128+
" \n",
129+
" comparative_analyses:\n",
130+
" - repo: BrentLab/yeast_comparative_analysis\n",
131+
" dataset: dto\n",
132+
" via_field: binding_id\n",
133+
" \n",
134+
" BrentLab/hackett_2020:\n",
135+
" dataset:\n",
136+
" hackett_2020:\n",
137+
" sample_id:\n",
138+
" field: sample_id\n",
139+
" dtype: numeric\n",
140+
" regulator_locus_tag:\n",
141+
" field: regulator_locus_tag\n",
142+
" temperature_celsius:\n",
143+
" path: temperature_celsius\n",
144+
" dtype: numeric\n",
145+
" cultivation_method:\n",
146+
" path: cultivation_method\n",
147+
" media_name:\n",
148+
" path: media.name\n",
149+
" induction_system:\n",
150+
" field: mechanism\n",
151+
" inducer_compound:\n",
152+
" field: mechanism\n",
153+
" path: definitions.inducer\n",
154+
" nutrient_restriction:\n",
155+
" field: restriction\n",
156+
" log2fc:\n",
157+
" field: log2_shrunken_timecourses\n",
158+
" dtype: numeric\n",
159+
" log2_raw_ratio:\n",
160+
" field: log2_ratio\n",
161+
" dtype: numeric\n",
162+
" time_point:\n",
163+
" field: time\n",
164+
" dtype: numeric\n",
165+
"\n",
166+
" comparative_analyses:\n",
167+
" - repo: BrentLab/yeast_comparative_analysis\n",
168+
" dataset: dto\n",
169+
" via_field: perturbation_id\n",
170+
"\n",
171+
" BrentLab/kemmeren_2014:\n",
172+
" dataset:\n",
173+
" kemmeren_2014:\n",
174+
" sample_id:\n",
175+
" field: sample_id\n",
176+
" carbon_source:\n",
177+
" path: media.carbon_source.compound\n",
178+
" temperature_celsius:\n",
179+
" path: temperature_celsius\n",
180+
" dtype: numeric\n",
181+
"\n",
182+
" comparative_analyses:\n",
183+
" - repo: BrentLab/yeast_comparative_analysis\n",
184+
" dataset: dto\n",
185+
" via_field: perturbation_id\n",
186+
"\n",
187+
" BrentLab/yeast_comparative_analysis:\n",
188+
" dataset:\n",
189+
" dto:\n",
190+
" binding_id:\n",
191+
" field: binding_id\n",
192+
" perturbation_id:\n",
193+
" field: perturbation_id\n",
194+
" fdr:\n",
195+
" field: dto_fdr\n",
196+
" dtype: numeric\n",
197+
" pvalue:\n",
198+
" field: dto_empirical_pvalue\n",
199+
" dtype: numeric\n",
200+
" binding_threshold:\n",
201+
" field: binding_rank_threshold\n",
202+
" dtype: numeric\n",
203+
" perturbation_threshold:\n",
204+
" field: perturbation_rank_threshold\n",
205+
" dtype: numeric\n",
206+
" binding_set_size:\n",
207+
" field: binding_set_size\n",
208+
" dtype: numeric\n",
209+
" perturbation_set_size:\n",
210+
" field: perturbation_set_size\n",
211+
" dtype: numeric\n",
212+
"\n",
213+
"factor_aliases:\n",
214+
" carbon_source:\n",
215+
" glucose: [D-glucose, dextrose, glu]\n",
216+
" galactose: [D-galactose, gal]\n",
217+
" raffinose: [D-raffinose]\n",
218+
"\n",
219+
"missing_value_labels:\n",
220+
" carbon_source: \"unspecified\"\n",
221+
"\n",
222+
"description:\n",
223+
" carbon_source: The carbon source provided during growth\n",
224+
" temperature_celsius: Growth temperature in degrees Celsius\n",
225+
" environmental_condition: Named environmental condition\n",
226+
"\"\"\"\n",
227+
"\n",
228+
"# Save the configuration to a temporary file\n",
229+
"temp_config = Path(tempfile.mkdtemp()) / \"vdb_config.yaml\"\n",
230+
"temp_config.write_text(config_yaml)\n",
231+
"\n",
232+
"print(f\"✅ Configuration file saved at: {temp_config}\")"
233+
]
234+
},
235+
{
236+
"cell_type": "code",
237+
"execution_count": null,
238+
"id": "1550d737",
239+
"metadata": {},
240+
"outputs": [],
241+
"source": [
242+
"# Initialize VirtualDB\n",
243+
"from tfbpapi.virtual_db import VirtualDB\n",
244+
"\n",
245+
"# Token authentication required\n",
246+
"hf_token = os.getenv(\"HF_TOKEN\", None)\n",
247+
"\n",
248+
"vdb = VirtualDB(str(temp_config), token=hf_token)\n",
249+
"\n",
250+
"print(\"✅ VirtualDB initialized successfully!\")\n",
251+
"print(f\"Number of configured repositories: {len(vdb.config.repositories)}\")\n",
252+
"\n",
253+
"# List all configured datasets\n",
254+
"print(\"\\nConfigured datasets:\")\n",
255+
"for repo_id, repo_config in vdb.config.repositories.items():\n",
256+
" if repo_config.dataset:\n",
257+
" for config_name in repo_config.dataset.keys():\n",
258+
" print(f\" - {repo_id}/{config_name}\")\n"
259+
]
260+
},
261+
{
262+
"cell_type": "markdown",
263+
"id": "e6dc4ce8",
264+
"metadata": {},
265+
"source": [
266+
"Added the ability to perform comparative analysis to the query."
267+
]
268+
},
269+
{
270+
"cell_type": "code",
271+
"execution_count": null,
272+
"id": "c9b1b241",
273+
"metadata": {},
274+
"outputs": [],
275+
"source": [
276+
"all_p001 = vdb.query(\n",
277+
" datasets=[(\"BrentLab/harbison_2004\", \"harbison_2004\")],\n",
278+
" fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"pvalue\", \"perturbation_id\"],\n",
279+
")\n",
280+
"\n",
281+
"all_p001.head(10)"
282+
]
283+
},
284+
{
285+
"cell_type": "code",
286+
"execution_count": null,
287+
"id": "f7f6a7f4",
288+
"metadata": {},
289+
"outputs": [],
290+
"source": [
291+
"# Even if not specified in the `fields` parameter, the filter options will still be retained within the `fields` parameter.\n",
292+
"all_p001 = vdb.query(\n",
293+
" datasets=[(\"BrentLab/hackett_2020\", \"hackett_2020\")],\n",
294+
" filters={\n",
295+
" \"pvalue\": (\"<=\", 0.01) \n",
296+
" },\n",
297+
" fields=[\"sample_id\", \"carbon_source\", \"temperature_celsius\", \"perturbation_id\",\"binding_id\"],\n",
298+
")\n",
299+
"\n",
300+
"all_p001.head(10)"
301+
]
302+
},
303+
{
304+
"cell_type": "markdown",
305+
"id": "5bd97850",
306+
"metadata": {},
307+
"source": [
308+
"Create a function called query_dto that is specifically responsible for retrieving the DTO data for the specified binding and perturbation datasets"
309+
]
310+
},
311+
{
312+
"cell_type": "code",
313+
"execution_count": null,
314+
"id": "20864108",
315+
"metadata": {},
316+
"outputs": [],
317+
"source": [
318+
"# Example: Query the intersection of harbison and hackett, filter for pvalue <= 0.01\n",
319+
"dto_result = vdb.query_dto(\n",
320+
" binding_dataset=(\"BrentLab/harbison_2004\", \"harbison_2004\"),\n",
321+
" perturbation_dataset=(\"BrentLab/hackett_2020\", \"hackett_2020\"),\n",
322+
" dto_filters={\"pvalue\": (\"<=\", 0.01)},\n",
323+
" fields=[\"binding_id\", \"perturbation_id\", \"pvalue\", \"fdr\",\"sample_id\", \"carbon_source\", \"temperature_celsius\"]\n",
324+
")\n",
325+
"\n",
326+
"print(f\"Found {len(dto_result)} DTO records\")\n",
327+
"print(f\"Column names: {list(dto_result.columns)}\")\n",
328+
"dto_result.head(10)"
329+
]
330+
},
331+
{
332+
"cell_type": "code",
333+
"execution_count": null,
334+
"id": "15f63f8a",
335+
"metadata": {},
336+
"outputs": [],
337+
"source": [
338+
"all_p001 = vdb.query( \n",
339+
" datasets=[(\"BrentLab/callingcards\", \"annotated_features\")],\n",
340+
" complete=False\n",
341+
")\n",
342+
"all_p001.head()\n",
343+
"print(f\"总共有 {len(all_p001)} 行数据\")"
344+
]
345+
}
346+
],
347+
"metadata": {
348+
"language_info": {
349+
"name": "python"
350+
}
351+
},
352+
"nbformat": 4,
353+
"nbformat_minor": 5
354+
}

0 commit comments

Comments
 (0)