Combine stimulus files from a given data path matching a specific pattern.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| data_path | Path | Path to the directory containing stimulus files. | required |
| matching_pattern | str | Pattern to match files for combining. | required |
Source code in src/data/preprocessing/union_raw_files.py
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def combine_stimulus_files(
    data_path: Path,
    matching_pattern: str,
    dataset_name: str,
) -> None:
    """
    Combine stimulus files from a given data path matching a specific pattern.

    Every file below ``data_path`` whose name matches ``matching_pattern`` is
    loaded and the rows are stacked into ``data_path / 'combined_stimulus.csv'``.
    Files whose path contains ``'reading'`` are parsed as tab-separated, get a
    ``filename`` column derived from the file name and ``sequence_num = -1``.
    All other files are parsed as comma-separated and take ``sequence_num``
    from the file name (the part after the last ``-`` and before the first ``_``).

    When the path contains ``'SBSAT'`` three more columns are filled in, which
    requires the combined data to have ``filename``, ``is_question`` and
    ``word`` columns:

    * ``stimulus_type``: second ``-``-separated token of ``filename``.
    * ``is_question``: missing values become ``False``.
    * ``question``: the ``word`` values of all ``is_question`` rows sharing the
      row's ``(stimulus_type, sequence_num)``, joined with spaces; ``None``
      when no such rows exist.

    Args:
        data_path (Path | str): Path to the directory containing stimulus files.
            It is searched recursively.
        matching_pattern (str): Glob pattern to match files for combining.
        dataset_name (str): Not used by this function; kept so that existing
            callers keep working.

    Returns:
        None. The result is written to disk; if no file matches, a warning is
        logged and nothing is written.
    """
    data_path = Path(data_path)
    output_path = data_path / 'combined_stimulus.csv'

    # Skip the output of a previous run (a broad pattern such as '*.csv' would
    # otherwise re-ingest it) and sort so the row order is reproducible.
    stimulus_files = sorted(
        path for path in data_path.rglob(matching_pattern) if path != output_path
    )
    if not stimulus_files:
        logger.warning(f'No files found matching {matching_pattern} in {data_path}.')
        return

    frames = []
    for file_ in stimulus_files:
        if 'reading' in str(file_):
            frame = pd.read_csv(file_, sep='\t')
            frame['filename'] = f'{file_.name.split("_")[0]}.png'
            frame['sequence_num'] = -1
        else:
            frame = pd.read_csv(file_)
            frame['sequence_num'] = file_.name.split('_')[0].split('-')[-1]
        frames.append(frame)
    # Concatenate once instead of growing a DataFrame inside the loop.
    combined_df = pd.concat(frames)

    if 'SBSAT' in str(data_path):
        logger.info('Filling in missing values for SBSAT dataset...')
        combined_df['stimulus_type'] = combined_df['filename'].apply(
            lambda x: x.split('-')[1]
        )
        # Assign the result back: an in-place fillna on a column selection is
        # chained assignment and has no effect under pandas Copy-on-Write.
        is_question = combined_df['is_question'].fillna(False).astype(bool)
        combined_df['is_question'] = is_question

        # Build the question text per (stimulus_type, sequence_num) from the
        # words of the rows flagged as questions.
        question_map = (
            combined_df[is_question]
            .groupby(['stimulus_type', 'sequence_num'])['word']
            .apply(' '.join)
        )
        # A MultiIndex Series converts to a dict keyed by tuples, which makes
        # the per-row lookup a plain dict access.
        questions = question_map.to_dict()
        combined_df['question'] = [
            questions.get(key)
            for key in zip(combined_df['stimulus_type'], combined_df['sequence_num'])
        ]

    combined_df.to_csv(output_path, index=False)
    logger.info(f'Combined stimulus CSV saved: {output_path}')
|