-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path02_data_exploration_simplified.py
More file actions
160 lines (123 loc) Β· 5.69 KB
/
02_data_exploration_simplified.py
File metadata and controls
160 lines (123 loc) Β· 5.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""
Step 2: Basic Data Exploration for Audio-Based Object Classification
======================================================================
This script performs basic exploratory data analysis (EDA) on the
UrbanSound8K dataset, focusing on essential information for the
YAMNet training pipeline.
Analysis includes:
- Class distribution and balance
- Basic dataset statistics
- File validation
"""
import os
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
# Install a process-wide "ignore" filter: every warning category raised after
# this point is suppressed (presumably to keep the console report uncluttered).
# NOTE(review): this also hides deprecation warnings from pandas/numpy —
# confirm that a blanket filter is intended.
warnings.filterwarnings('ignore')
class AudioDataExplorer:
    """Print basic exploratory reports for the filtered dataset metadata.

    The explorer reads ``filtered_metadata.csv`` from ``data_dir`` and checks
    audio files under ``data_dir/UrbanSound8K/audio/fold<N>/``. The metadata
    is expected to provide the ``class``, ``fold`` and ``slice_file_name``
    columns, which are the only columns the methods below read.

    NOTE(review): the leading glyphs in the status messages (``π``, ``β``,
    ``β οΈ``) look like mis-decoded emoji. They are reproduced unchanged here;
    confirm the intended characters before restoring them.
    """

    def __init__(self, data_dir="data"):
        """Derive the metadata and audio locations from *data_dir*.

        Args:
            data_dir: Directory that holds ``filtered_metadata.csv`` and the
                ``UrbanSound8K/audio`` tree. Anything ``Path`` accepts.
        """
        self.data_dir = Path(data_dir)
        self.metadata_path = self.data_dir / "filtered_metadata.csv"
        self.audio_dir = self.data_dir / "UrbanSound8K" / "audio"

    def load_metadata(self):
        """Read the filtered metadata CSV.

        Returns:
            The metadata as a ``DataFrame``, or ``None`` (after printing a
            hint) when the CSV file does not exist.
        """
        print("π Loading Filtered Metadata...")

        if not self.metadata_path.exists():
            print("β Filtered metadata not found! Run 01_dataset_acquisition.py first.")
            return None

        df = pd.read_csv(self.metadata_path)
        print(f"β Loaded {len(df)} samples across {df['class'].nunique()} classes")
        return df

    def analyze_class_distribution(self, df):
        """Print per-class and per-fold sample counts.

        Args:
            df: Metadata with ``class`` and ``fold`` columns.

        Returns:
            A ``Series`` of sample counts indexed by class name, sorted by
            class name.
        """
        print("\nπ Class Distribution Analysis...")
        print("=" * 60)

        class_counts = df['class'].value_counts().sort_index()
        total_samples = len(df)

        print("π Dataset Overview:")
        print(f" Total samples: {total_samples}")
        print(f" Number of classes: {df['class'].nunique()}")
        # tolist() yields plain Python scalars, so the list prints as
        # "[1, 2]" instead of "[np.int64(1), ...]" under NumPy 2.
        print(f" Folds: {sorted(df['fold'].unique().tolist())}")
        print()

        print("π Class Distribution:")
        print(f"{'Class':15} {'Count':>6} {'Percentage':>10} {'Adequacy':>12}")
        print("-" * 50)
        # class_counts is already sorted by class name; the loop body never
        # runs for an empty frame, so the division below cannot hit zero.
        for class_name, count in class_counts.items():
            percentage = (count / total_samples) * 100
            if count >= 100:
                adequacy = "Good"
            elif count >= 50:
                adequacy = "Limited"
            else:
                adequacy = "Poor"
            print(f"{class_name:15} {count:6d} {percentage:9.1f}% {adequacy:>12}")

        print("\nπ Fold Distribution:")
        fold_counts = df['fold'].value_counts().sort_index()
        for fold, count in fold_counts.items():
            print(f" Fold {fold}: {count:3d} samples")

        return class_counts

    def validate_audio_files(self, df, sample_size=10, random_state=None):
        """Check that a random sample of audio files per class exists on disk.

        Args:
            df: Metadata with ``class``, ``fold`` and ``slice_file_name``
                columns.
            sample_size: Maximum number of files to check per class; classes
                with fewer rows are checked in full.
            random_state: Optional seed forwarded to ``DataFrame.sample`` so
                the selection can be reproduced. ``None`` keeps the sampling
                random on every call.

        Returns:
            A dict mapping each class name to a dict with the keys
            ``checked``, ``accessible`` and ``accessibility_rate`` (percent).
        """
        print(f"\nπ Validating Audio File Access (sample: {sample_size})...")

        validation_results = {}
        total_checked = 0
        total_accessible = 0

        for class_name in df['class'].unique():
            # Filter once per class and reuse the result for size and sample.
            class_rows = df[df['class'] == class_name]
            sampled = class_rows.sample(
                min(sample_size, len(class_rows)),
                random_state=random_state,
            )

            checked = len(sampled)
            accessible = sum(
                (self.audio_dir / f"fold{fold}" / file_name).exists()
                for fold, file_name in zip(sampled['fold'], sampled['slice_file_name'])
            )
            total_checked += checked
            total_accessible += accessible

            validation_results[class_name] = {
                'checked': checked,
                'accessible': accessible,
                'accessibility_rate': (accessible / checked) * 100 if checked > 0 else 0,
            }

        print("π File Accessibility Results:")
        print(f"{'Class':15} {'Checked':>8} {'Accessible':>10} {'Rate':>8}")
        print("-" * 50)
        for class_name, results in validation_results.items():
            rate = results['accessibility_rate']
            print(f"{class_name:15} {results['checked']:8d} {results['accessible']:10d} {rate:7.1f}%")

        overall_rate = (total_accessible / total_checked) * 100 if total_checked > 0 else 0
        print(f"\nπ Overall Accessibility: {total_accessible}/{total_checked} ({overall_rate:.1f}%)")

        # Only claim that every sampled file is accessible when that is
        # literally true. An empty sample still produces the warning.
        all_accessible = total_checked > 0 and total_accessible == total_checked
        if all_accessible:
            print("β All sampled audio files are accessible")
        else:
            print("β οΈ Warning: Some audio files are not accessible!")

        return validation_results
def main():
    """Run the exploration steps in order and print a summary.

    Loads the filtered metadata, prints the class and fold distribution,
    checks a sample of five audio files per class, then prints a summary and
    next steps. Returns early, after the loader has printed its hint, when
    the metadata file is missing.

    Raises:
        Exception: Any error raised by a step is reported on stdout and then
            re-raised unchanged.
    """
    print("π΅ Basic Data Exploration for Audio Classification")
    print("=" * 60)

    try:
        explorer = AudioDataExplorer()

        df = explorer.load_metadata()
        if df is None:
            return

        # Both steps are called for their printed reports; the values they
        # return are not needed here.
        explorer.analyze_class_distribution(df)
        explorer.validate_audio_files(df, sample_size=5)

        print("\nβ Basic Data Exploration Complete!")
        print("\nπ Summary:")
        print(f" - {len(df)} total samples")
        print(f" - {df['class'].nunique()} audio classes")
        print(f" - {df['fold'].nunique()} data folds")
        print(" - Audio files validated")

        print("\nπ Next Steps:")
        print(" - Run 03_data_cleaning.py for data validation and cleaning")
        print(" - Proceed with 04_data_preprocessing.py for feature preparation")
    except Exception as e:
        # Top-level boundary: report the failure, then let it propagate so
        # the script still ends with a traceback.
        print(f"β Error during data exploration: {e}")
        raise
# Run the exploration only when this file is executed as a script, so that
# importing the module has no side effect beyond the definitions above.
if __name__ == "__main__":
    main()