
union_raw_files

combine_stimulus_files(data_path, matching_pattern, dataset_name)

Combine stimulus files from a given data path matching a specific pattern.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data_path` | `Path` | Path to the directory containing stimulus files. | required |
| `matching_pattern` | `str` | Pattern to match files for combining. | required |
| `dataset_name` | `str` | Name of the dataset. | required |
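A minimal usage sketch, assuming the import path mirrors the source location shown below; the data directory, glob pattern, and dataset name are placeholder values:

```python
from pathlib import Path

from src.data.preprocessing.union_raw_files import combine_stimulus_files

# Hypothetical call: combine all stimulus files found under a raw-data directory.
combine_stimulus_files(
    data_path=Path('data/raw/SBSAT'),  # placeholder directory containing the stimulus files
    matching_pattern='*stimuli*',      # placeholder glob pattern passed to Path.rglob
    dataset_name='SBSAT',
)
# On success, a combined_stimulus.csv file is written into data_path.
```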
Source code in src/data/preprocessing/union_raw_files.py
def combine_stimulus_files(
    data_path: Path,
    matching_pattern: str,
    dataset_name: str,
) -> None:
    """
    Combine stimulus files from a given data path matching a specific pattern.

    Args:
        data_path (Path): Path to the directory containing stimulus files.
        matching_pattern (str): Glob pattern used to match the files to combine.
        dataset_name (str): Name of the dataset.
    """
    stimulus_files = list(data_path.rglob(matching_pattern))

    if not stimulus_files:
        logger.warning(f'No files found matching {matching_pattern} in {data_path}.')
        return

    combined_df = pd.DataFrame()
    for file_ in stimulus_files:
        if 'reading' in str(file_):
            read_df = pd.read_csv(file_, sep='\t')
            read_df['filename'] = f'{file_.name.split("_")[0]}.png'
            read_df['sequence_num'] = -1
            combined_df = pd.concat([combined_df, read_df])
        else:
            quest_df = pd.read_csv(file_)
            quest_df['sequence_num'] = file_.name.split('_')[0].split('-')[-1]
            combined_df = pd.concat([combined_df, quest_df])

    if 'SBSAT' in str(data_path):
        logger.info('Filling in missing values for SBSAT dataset...')
        combined_df['stimulus_type'] = combined_df['filename'].apply(
            lambda x: x.split('-')[1]
        )
        combined_df['is_question'] = combined_df['is_question'].fillna(False)

        # Build the 'question' text by concatenating all 'word' values where 'is_question'
        # is True within each (stimulus_type, sequence_num) group
        question_map = (
            combined_df[combined_df['is_question']]
            .groupby(['stimulus_type', 'sequence_num'])['word']
            .apply(' '.join)
        )
        combined_df['question'] = combined_df.apply(
            lambda row: question_map.get(
                (row['stimulus_type'], row['sequence_num']), None
            ),  # type: ignore
            axis=1,
        )

    combined_df.to_csv(data_path / 'combined_stimulus.csv', index=False)
    logger.info(f'Combined stimulus CSV saved: {data_path / "combined_stimulus.csv"}')
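For the SBSAT branch, the question text is reconstructed by joining the question words within each (stimulus_type, sequence_num) group and then mapping that text back onto every row. A small self-contained sketch of that groupby pattern, using made-up values:

```python
import pandas as pd

# Toy frame standing in for combined_df (values are made up for illustration).
df = pd.DataFrame({
    'stimulus_type': ['dickens', 'dickens', 'dickens'],
    'sequence_num': ['1', '1', '-1'],
    'word': ['What', 'happened?', 'Once'],
    'is_question': [True, True, False],
})

# Join the question words within each (stimulus_type, sequence_num) group.
question_map = (
    df[df['is_question']]
    .groupby(['stimulus_type', 'sequence_num'])['word']
    .apply(' '.join)
)

# Look up the joined question for every row; rows without a matching group get None.
df['question'] = df.apply(
    lambda row: question_map.get((row['stimulus_type'], row['sequence_num']), None),
    axis=1,
)
print(df[['sequence_num', 'is_question', 'question']])
# The two question rows get 'What happened?'; the reading row (sequence_num -1) gets None.
```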