
union_raw_files

combine_stimulus_files(data_path, matching_pattern, dataset_name)

Combine stimulus files from a given data path matching a specific pattern.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data_path` | `Path` | Path to the directory containing stimulus files. | required |
| `matching_pattern` | `str` | Pattern to match files for combining. | required |
| `dataset_name` | `str` | Name of the dataset. | required |
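A minimal usage sketch, assuming the import path mirrors the source location shown below; the data directory, glob pattern, and dataset name are placeholder values:

```python
from pathlib import Path

from src.data.preprocessing.union_raw_files import combine_stimulus_files

# Hypothetical call: combine all stimulus files found under a raw-data directory.
combine_stimulus_files(
    data_path=Path('data/raw/SBSAT'),  # placeholder directory containing the stimulus files
    matching_pattern='*stimuli*',      # placeholder glob pattern passed to Path.rglob
    dataset_name='SBSAT',
)
# On success, a combined_stimulus.csv file is written into data_path.
```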
Source code in src/data/preprocessing/union_raw_files.py
def combine_stimulus_files(
    data_path: Path,
    matching_pattern: str,
    dataset_name: str,
) -> None:
    """
    Combine stimulus files from a given data path matching a specific pattern.

    Args:
        data_path (Path): Path to the directory containing stimulus files.
        matching_pattern (str): Glob pattern used to match the files to combine.
        dataset_name (str): Name of the dataset.
    """
    stimulus_files = list(data_path.rglob(matching_pattern))

    if not stimulus_files:
        logger.warning(f'No files found matching {matching_pattern} in {data_path}.')
        return

    combined_df = pd.DataFrame()
    for file_ in stimulus_files:
        if 'reading' in str(file_):
            read_df = pd.read_csv(file_, sep='\t')
            read_df['filename'] = f'{file_.name.split("_")[0]}.png'
            read_df['sequence_num'] = -1
            combined_df = pd.concat([combined_df, read_df])
        else:
            quest_df = pd.read_csv(file_)
            quest_df['sequence_num'] = file_.name.split('_')[0].split('-')[-1]
            combined_df = pd.concat([combined_df, quest_df])

    if 'SBSAT' in str(data_path):
        logger.info('Filling in missing values for SBSAT dataset...')
        combined_df['stimulus_type'] = combined_df['filename'].apply(
            lambda x: x.split('-')[1]
        )
        combined_df['is_question'] = combined_df['is_question'].fillna(False)

        # Build the 'question' text by concatenating all 'word' values where 'is_question'
        # is True within each (stimulus_type, sequence_num) group
        question_map = (
            combined_df[combined_df['is_question']]
            .groupby(['stimulus_type', 'sequence_num'])['word']
            .apply(' '.join)
        )
        combined_df['question'] = combined_df.apply(
            lambda row: question_map.get(
                (row['stimulus_type'], row['sequence_num']), None
            ),  # type: ignore
            axis=1,
        )

    combined_df.to_csv(data_path / 'combined_stimulus.csv', index=False)
    logger.info(f'Combined stimulus CSV saved: {data_path / "combined_stimulus.csv"}')
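For the SBSAT branch, the question text is reconstructed by joining the question words within each (stimulus_type, sequence_num) group and then mapping that text back onto every row. A small self-contained sketch of that groupby pattern, using made-up values:

```python
import pandas as pd

# Toy frame standing in for combined_df (values are made up for illustration).
df = pd.DataFrame({
    'stimulus_type': ['dickens', 'dickens', 'dickens'],
    'sequence_num': ['1', '1', '-1'],
    'word': ['What', 'happened?', 'Once'],
    'is_question': [True, True, False],
})

# Join the question words within each (stimulus_type, sequence_num) group.
question_map = (
    df[df['is_question']]
    .groupby(['stimulus_type', 'sequence_num'])['word']
    .apply(' '.join)
)

# Look up the joined question for every row; rows without a matching group get None.
df['question'] = df.apply(
    lambda row: question_map.get((row['stimulus_type'], row['sequence_num']), None),
    axis=1,
)
print(df[['sequence_num', 'is_question', 'question']])
# The two question rows get 'What happened?'; the reading row (sequence_num -1) gets None.
```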