-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtsv_processor.c
More file actions
259 lines (219 loc) · 8.67 KB
/
tsv_processor.c
File metadata and controls
259 lines (219 loc) · 8.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <assert.h>
#define CHAR_CR '\r' /* carriage return, found in PC-format text files; skipped by mygetchar() */
#define CHAR_NL '\n' /* newline character, ends each input line */
#define CHAR_TB '\t' /* tab character, ends each TSV field */
#define STATUS_EOL 1 /* getfield() result: the field was ended by a newline */
#define STATUS_EOF 2 /* getfield() result: the field was ended by end of file */
#define STATUS_NML 3 /* getfield() result: neither of the above (a tab or a NUL byte) */
#define MAXFIELDLEN 50 /* maximum number of characters stored for each field */
#define MAXCOLS 5 /* every row is expected to have exactly 5 columns */
#define MAXROWS 1000 /* maximum number of data rows held in memory */
/* one tsv field, stored within a fixed-length character string:
   up to MAXFIELDLEN characters plus the terminating '\0' */
typedef char field_t[MAXFIELDLEN + 1];
/**************************************************************/
/* read one character from standard input, skipping any CR characters */
int mygetchar(void);
/* read one field into dest; returns STATUS_EOL, STATUS_EOF or STATUS_NML */
int getfield(field_t dest);
/* print the Stage 1 summary and the last data row */
void print_stage1(field_t data[][MAXCOLS], int nrows, field_t headers[]);
/* print the sort description and the first, middle and last rows */
void print_stage2_output(field_t data[][MAXCOLS], int nrows, field_t headers[], int *sort_columns, int num_columns);
/* sort the rows in place by the listed columns (0-based indices) */
void insertion_sort(field_t data[][MAXCOLS], int nrows, int *sort_columns, int num_columns);
/* strcmp-style comparison of two rows over the listed columns */
int compare_rows(field_t row1[], field_t row2[], int *sort_columns, int num_columns);
/* print the Stage 3 heading lines */
void print_stage3_header(int *sort_columns, int num_columns, field_t headers[]);
/* print the Stage 3 table of counts and the closing lines */
void tabulate_data(field_t data[][MAXCOLS], int nrows, int *sort_columns, int num_columns);
/**************************************************************/
int main(int argc, char *argv[]) {
field_t headers[MAXCOLS]; /*array to store the header row*/
field_t data[MAXROWS][MAXCOLS]; /* 2D array to store the TSV data*/
int sort_columns[MAXCOLS]; /* array to store sort column indices*/
int num_columns = argc - 1; /* number of sort columns provided*/
int nrows = 0; /* row counter*/
int status, nfields;
/* If no sort columns are provided, exit after Stage 1 */
if (argc == 1) {
printf("No column numbers provided. Exiting after Stage 1.\n");
return 0;
}
for (int i = 1; i < argc; i++) {
sort_columns[i - 1] = atoi(argv[i]) - 1;
if (sort_columns[i - 1] < 0 || sort_columns[i - 1] >= MAXCOLS) {
printf("Invalid column. Please specify a column between 1 and %d.\n", MAXCOLS);
return 1;
}
}
/* read the header row*/
for (int i = 0; i < MAXCOLS; i++) {
getfield(headers[i]);
}
/*read the data rows*/
while (nrows < MAXROWS) {
nfields = 0;
while (nfields < MAXCOLS) {
status = getfield(data[nrows][nfields]);
nfields++;
if (status == STATUS_EOL || status == STATUS_EOF) {
break;
}
}
if (nfields == MAXCOLS) {
nrows++;
}
if (status == STATUS_EOF) {
break;
}
}
/* print Stage 1 output*/
print_stage1(data, nrows, headers);
/* sort the data using insertion sort (Stage 2)*/
insertion_sort(data, nrows, sort_columns, num_columns);
/* print Stage 2 output: first, middle, and last rows after sorting*/
print_stage2_output(data, nrows, headers, sort_columns, num_columns);
/* print Stage 3 header*/
print_stage3_header(sort_columns, num_columns, headers);
/* proceed to Stage 3: Tabulation*/
tabulate_data(data, nrows, sort_columns, num_columns);
return 0;
}
/**************************************************************/
/* Read one field from standard input into dest.
   Characters are taken from mygetchar() until a tab, a newline, a NUL
   byte or end of file is met; the terminator itself is not stored.
   At most MAXFIELDLEN characters are kept and any further ones are
   discarded, so dest is always a valid '\0'-terminated string.
   Returns STATUS_EOF if end of file ended the field, STATUS_EOL if a
   newline did, and STATUS_NML otherwise.
*/
int getfield(field_t dest) {
    int stored = 0;
    int c;

    dest[0] = '\0';
    for (;;) {
        c = mygetchar();
        if (c == 0 || c == CHAR_TB || c == CHAR_NL || c == EOF) {
            break;
        }
        if (stored < MAXFIELDLEN) {
            /* still room: append and re-terminate */
            dest[stored++] = c;
            dest[stored] = '\0';
        }
    }

    if (c == EOF) {
        return STATUS_EOF;
    }
    if (c == CHAR_NL) {
        return STATUS_EOL;
    }
    return STATUS_NML;
}
/**************************************************************/
/* Return the next character from standard input that is not a CR, so
   that PC-type (CR+NL) and Unix-type (NL) input behave the same.
   Whatever getchar() returns, including EOF, is passed through.
*/
int mygetchar(void) {
    int ch;
    do {
        ch = getchar();
    } while (ch == CHAR_CR);
    return ch;
}
/* Print the Stage 1 report: the table dimensions followed by the last
   data row, one "number: header value" line per column.
   data    - the data rows
   nrows   - number of valid rows in data
   headers - the MAXCOLS column names
   When nrows is zero or negative only the dimensions are printed,
   because there is no last row to show.
*/
void print_stage1(field_t data[][MAXCOLS], int nrows, field_t headers[]) {
    printf("Stage 1\n");
    printf("input tsv data has %d rows and %d columns\n", nrows, MAXCOLS);
    if (nrows <= 0) {
        /* data[nrows - 1] would be outside the array */
        return;
    }
    printf("row %d is:\n", nrows);
    for (int i = 0; i < MAXCOLS; i++) {
        printf(" %d: %-10s %s\n", i + 1, headers[i], data[nrows - 1][i]);
    }
}
/* print one data row (0-based index row) as "row N is:" followed by one
   "number: header value" line per column */
static void print_numbered_row(field_t data[][MAXCOLS], int row, field_t headers[]) {
    printf("row %d is:\n", row + 1);
    for (int j = 0; j < MAXCOLS; j++) {
        printf(" %d: %-10s %s\n", j + 1, headers[j], data[row][j]);
    }
}

/* Print the Stage 2 report: the names of the sort columns in priority
   order, then the first, middle and last rows of the table.
   data         - the data rows
   nrows        - number of valid rows in data
   headers      - the MAXCOLS column names
   sort_columns - 0-based indices of the sort columns
   num_columns  - number of entries in sort_columns
   The sort description is skipped when num_columns is not positive and
   the rows are skipped when nrows is not positive, since in those cases
   there is nothing valid to index.
*/
void print_stage2_output(field_t data[][MAXCOLS], int nrows, field_t headers[], int *sort_columns, int num_columns) {
    printf("\nStage 2\n");
    if (num_columns > 0) {
        printf("sorting by \"%s\"", headers[sort_columns[0]]);
        for (int i = 1; i < num_columns; i++) {
            printf(",\n then by \"%s\"", headers[sort_columns[i]]);
        }
        printf("\n");
    }
    if (nrows <= 0) {
        return;
    }
    /* middle row is row ceil(nrows / 2), converted to a 0-based index */
    int middle = (nrows + 1) / 2 - 1;
    print_numbered_row(data, 0, headers);
    print_numbered_row(data, middle, headers);
    print_numbered_row(data, nrows - 1, headers);
}
/* Compare two rows column by column, in the order given by sort_columns.
   Returns the strcmp() result of the first listed column on which the
   rows differ, or 0 when they agree on all num_columns columns.
*/
int compare_rows(field_t row1[], field_t row2[], int *sort_columns, int num_columns) {
    int result = 0;
    /* stop at the first column that decides the ordering */
    for (int k = 0; k < num_columns && result == 0; k++) {
        result = strcmp(row1[sort_columns[k]], row2[sort_columns[k]]);
    }
    return result;
}
/* Sort the first nrows rows of data in place into ascending order, as
   defined by compare_rows() over the given sort columns.
   Rows are only moved past rows that compare strictly greater, so rows
   that compare equal keep their original relative order.
*/
void insertion_sort(field_t data[][MAXCOLS], int nrows, int *sort_columns, int num_columns) {
    field_t pending[MAXCOLS];              /* the row currently being placed */
    const size_t row_bytes = sizeof(pending);

    for (int next = 1; next < nrows; next++) {
        memcpy(pending, data[next], row_bytes);
        /* slide larger rows up one place until the gap is where pending belongs */
        int slot = next;
        for (; slot > 0; slot--) {
            if (compare_rows(data[slot - 1], pending, sort_columns, num_columns) <= 0) {
                break;
            }
            memcpy(data[slot], data[slot - 1], row_bytes);
        }
        memcpy(data[slot], pending, row_bytes);
    }
}
/* Print the Stage 3 heading: a rule, the name of each sort column on its
   own line (every name after the first preceded by one space), the word
   "Count", and a closing rule.
*/
void print_stage3_header(int *sort_columns, int num_columns, field_t headers[]) {
    printf("\nStage 3\n");
    printf("------------------\n");
    int k = 0;
    while (k < num_columns) {
        if (k > 0) {
            printf(" ");
        }
        printf("%s\n", headers[sort_columns[k]]);
        k++;
    }
    printf("Count\n");
    printf("------------------\n");
}
/* return 1 when rows a and b hold equal strings in every sort column, else 0 */
static int same_sort_key(field_t a[], field_t b[], int *sort_columns, int num_columns) {
    for (int k = 0; k < num_columns; k++) {
        if (strcmp(a[sort_columns[k]], b[sort_columns[k]]) != 0) {
            return 0;
        }
    }
    return 1;
}

/* Print the Stage 3 table, then the closing rule and sign-off line.
   data         - the data rows; rows with equal sort-column values must
                  be adjacent (main() sorts the table before calling)
   nrows        - number of valid rows in data
   sort_columns - 0-based indices of the sort columns
   num_columns  - number of entries in sort_columns

   Each run of rows with identical values in all sort columns yields one
   count line showing the value of the last sort column and the run
   length. Values of the earlier sort columns are printed as heading
   lines, and only when they differ from the previous run, so a heading
   appears once per group. A line at level k (0-based position in
   sort_columns) is indented by k spaces.

   Any number of sort columns from 1 upward is handled. With an empty
   table or no sort columns only the closing lines are printed.
*/
void tabulate_data(field_t data[][MAXCOLS], int nrows, int *sort_columns, int num_columns) {
    if (nrows > 0 && num_columns > 0) {
        /* width of the widest value in any sort column, for alignment */
        int longest_entry = 0;
        for (int i = 0; i < nrows; i++) {
            for (int j = 0; j < num_columns; j++) {
                int len = (int)strlen(data[i][sort_columns[j]]);
                if (len > longest_entry) {
                    longest_entry = len;
                }
            }
        }

        int last_level = num_columns - 1; /* level that carries the count */
        int group_start = 0;              /* first row of the current run */
        int prev_group = -1;              /* first row of the previous run */

        /* i == nrows is one step past the end, used to flush the final run */
        for (int i = 1; i <= nrows; i++) {
            if (i < nrows &&
                same_sort_key(data[i], data[group_start], sort_columns, num_columns)) {
                continue;
            }

            /* first heading level whose value changed since the previous run;
               the very first run prints every heading */
            int level = 0;
            if (prev_group >= 0) {
                while (level < last_level &&
                       strcmp(data[group_start][sort_columns[level]],
                              data[prev_group][sort_columns[level]]) == 0) {
                    level++;
                }
            }
            for (; level < last_level; level++) {
                printf("%*s%-*s\n", level, "", longest_entry + 4,
                       data[group_start][sort_columns[level]]);
            }
            printf("%*s%-*s %5d\n", last_level, "", longest_entry,
                   data[group_start][sort_columns[last_level]], i - group_start);

            prev_group = group_start;
            group_start = i;
        }
    }
    printf("------------------\n");
    printf("ta daa!");
}