csv_to_latex

Convert the metric CSV from the EyeBench benchmark into a LaTeX table.

build_task_wide_df(all_metrics_data_val, all_metrics_data_test, task)

Given all metrics data for val and test splits, build a wide DataFrame where:

- Each row is a model (in MODEL_ORDER)
- Each column pair is (metric_val, metric_test)
- Values are from the 'All' column for the specified task

Parameters:

    all_metrics_data_val (dict, required): Dict mapping metric name to (df_discri_eval, df_reg_eval) tuples for validation
    all_metrics_data_test (dict, required): Dict mapping metric name to (df_discri_eval, df_reg_eval) tuples for test
    task (str, required): The data task to extract metrics for

Returns:

    DataFrame: Wide DataFrame with models as rows and metric columns for both val and test

Source code in src/run/multi_run/csv_to_latex.py
def build_task_wide_df(
    all_metrics_data_val: dict,
    all_metrics_data_test: dict,
    task: str,
) -> pd.DataFrame:
    """
    Given all metrics data for val and test splits, build a wide DataFrame where:
    - Each row is a model (in MODEL_ORDER)
    - Each column pair is (metric_val, metric_test)
    - Values are from the 'All' column for the specified task

    Args:
        all_metrics_data_val: Dict mapping metric name to (df_discri_eval, df_reg_eval) tuples for validation
        all_metrics_data_test: Dict mapping metric name to (df_discri_eval, df_reg_eval) tuples for test
        task: The data task to extract metrics for

    Returns:
        Wide DataFrame with models as rows and metric columns for both val and test
    """
    # Start with the full list of models
    wide = pd.DataFrame({'Model': MODEL_ORDER_CLASSIFICATION})

    # Determine which metrics apply to this task
    is_regression_task = task in REG_TASKS

    # Process each metric for both val and test
    for metric_name in all_metrics_data_test.keys():
        # Skip regression metric for classification tasks and vice versa
        if is_regression_task and metric_name not in RegrSupportedMetrics:
            continue
        if not is_regression_task and metric_name in RegrSupportedMetrics:
            continue

        # Process validation data
        df_discri_eval_val, df_reg_eval_val = all_metrics_data_val[metric_name]
        if metric_name in RegrSupportedMetrics:
            df_eval_val = df_reg_eval_val.copy()
            if not df_eval_val.empty:
                for ml_col, dl_col in ML_REGRESSION_TO_CLASSIFICATION.items():
                    df_eval_val.loc[df_eval_val['Model'] == ml_col, 'Model'] = dl_col
        else:
            df_eval_val = df_discri_eval_val

        # Process test data
        df_discri_eval_test, df_reg_eval_test = all_metrics_data_test[metric_name]
        if metric_name in RegrSupportedMetrics:
            df_eval_test = df_reg_eval_test.copy()
            if not df_eval_test.empty:
                for ml_col, dl_col in ML_REGRESSION_TO_CLASSIFICATION.items():
                    df_eval_test.loc[df_eval_test['Model'] == ml_col, 'Model'] = dl_col
        else:
            df_eval_test = df_discri_eval_test

        # Extract validation data for this task
        if not df_eval_val.empty:
            task_data_val = df_eval_val[df_eval_val['Data'] == task]
            if not task_data_val.empty:
                col_name = f'{METRICS_LABELS.get(metric_name, metric_name)}_Val'
                subset = task_data_val.set_index('Model')['All'].rename(col_name)
                wide = wide.join(subset, on='Model')

        # Extract test data for this task
        if not df_eval_test.empty:
            task_data_test = df_eval_test[df_eval_test['Data'] == task]
            if not task_data_test.empty:
                col_name = f'{METRICS_LABELS.get(metric_name, metric_name)}_Test'
                subset = task_data_test.set_index('Model')['All'].rename(col_name)
                wide = wide.join(subset, on='Model')

    return wide.fillna('')
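
A minimal usage sketch (the metric key 'auroc', the model name 'Roberta', and the task key 'SBSAT_RC' are assumptions for illustration; the real keys come from the benchmark configuration and the module-level constants such as MODEL_ORDER_CLASSIFICATION and METRICS_LABELS):

import pandas as pd

from src.run.multi_run.csv_to_latex import build_task_wide_df

# Hypothetical inputs: each metric maps to a (df_discri_eval, df_reg_eval) tuple
# whose DataFrames carry 'Model', 'Data', and 'All' columns.
df_auroc = pd.DataFrame(
    {'Model': ['Roberta'], 'Data': ['SBSAT_RC'], 'All': ['65.0 ± 2.3']}
)
empty = pd.DataFrame()  # no regression results for this metric

all_metrics_val = {'auroc': (df_auroc, empty)}
all_metrics_test = {'auroc': (df_auroc, empty)}

# One row per model in MODEL_ORDER_CLASSIFICATION; one '<Metric>_Val' and
# '<Metric>_Test' column pair per metric; models without data get ''.
wide = build_task_wide_df(all_metrics_val, all_metrics_test, task='SBSAT_RC')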

build_task_wide_df_by_regime(all_metrics_data, task, eval_type)

Given all metrics data for a single eval split, build a wide DataFrame where:

- Each row is a model (in MODEL_ORDER)
- Columns are organized by regime (Unseen Reader, Unseen Text, Unseen Both, All)
- For each regime, all relevant metrics are shown
- Values are from the specified regime columns

Parameters:

    all_metrics_data (dict, required): Dict mapping metric name to (df_discri_eval, df_reg_eval) tuples
    task (str, required): The data task to extract metrics for
    eval_type (str, required): 'val' or 'test'

Returns:

    DataFrame: Wide DataFrame with models as rows and (regime, metric) multi-index columns

Source code in src/run/multi_run/csv_to_latex.py
def build_task_wide_df_by_regime(
    all_metrics_data: dict,
    task: str,
    eval_type: str,
) -> pd.DataFrame:
    """
    Given all metrics data for a single eval split, build a wide DataFrame where:
    - Each row is a model (in MODEL_ORDER)
    - Columns are organized by regime (Unseen Reader, Unseen Text, Unseen Both, All)
    - For each regime, all relevant metrics are shown
    - Values are from the specified regime columns

    Args:
        all_metrics_data: Dict mapping metric name to (df_discri_eval, df_reg_eval) tuples
        task: The data task to extract metrics for
        eval_type: 'val' or 'test'

    Returns:
        Wide DataFrame with models as rows and (regime, metric) multi-index columns
    """
    # Start with the full list of models
    wide = pd.DataFrame({'Model': MODEL_ORDER_CLASSIFICATION})

    # Determine which metrics apply to this task
    is_regression_task = task in REG_TASKS

    # Process each metric
    for metric_name in all_metrics_data.keys():
        # Skip regression metric for classification tasks and vice versa
        if is_regression_task and metric_name not in RegrSupportedMetrics:
            continue
        if not is_regression_task and metric_name in RegrSupportedMetrics:
            continue

        # Get the appropriate dataframe
        df_discri_eval, df_reg_eval = all_metrics_data[metric_name]
        if metric_name in RegrSupportedMetrics:
            df_eval = df_reg_eval.copy()
            if not df_eval.empty:
                for ml_col, dl_col in ML_REGRESSION_TO_CLASSIFICATION.items():
                    df_eval.loc[df_eval['Model'] == ml_col, 'Model'] = dl_col
        else:
            df_eval = df_discri_eval

        # Extract data for this task
        if not df_eval.empty:
            task_data = df_eval[df_eval['Data'] == task]
            if not task_data.empty:
                # For each regime column, add it to the wide dataframe
                for source_col, regime_label in REGIME_COLS.items():
                    if source_col in task_data.columns:
                        col_name = f'{regime_label}_{METRICS_LABELS.get(metric_name, metric_name)}'
                        subset = task_data.set_index('Model')[source_col].rename(
                            col_name
                        )
                        wide = wide.join(subset, on='Model')

    return wide.fillna('')
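
A short sketch of the per-regime variant, reusing the same input shape but with one column per evaluation regime; the regime column names mirror those used by generate_breakdown_tables below and, like the model and task keys, are assumptions here:

import pandas as pd

from src.run.multi_run.csv_to_latex import build_task_wide_df_by_regime

# Hypothetical discriminative results with per-regime columns.
df_auroc = pd.DataFrame({
    'Model': ['Roberta'],
    'Data': ['SBSAT_RC'],
    'Unseen subject seen item': ['64.1 ± 1.9'],
    'Seen subject unseen item': ['66.0 ± 2.1'],
    'Unseen subject unseen item': ['63.2 ± 2.4'],
    'All': ['65.0 ± 2.3'],
})
all_metrics = {'auroc': (df_auroc, pd.DataFrame())}

# Adds one '<Regime>_<Metric>' column per entry of REGIME_COLS present in the data.
wide = build_task_wide_df_by_regime(all_metrics, task='SBSAT_RC', eval_type='test')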

build_wide_df(df_discri_eval, df_reg_eval, include_regression=True)

Given a filtered DataFrame for one eval split, pivot it so that each row is a model (in MODEL_ORDER), each column is a task (in TASK_ORDER), and missing values are kept as empty strings.

Parameters:

    df_discri_eval (DataFrame, required): DataFrame with classification metrics
    df_reg_eval (DataFrame, required): DataFrame with regression metrics
    include_regression (bool, default True): If True, include regression tasks. If False, only classification tasks.

Source code in src/run/multi_run/csv_to_latex.py
def build_wide_df(
    df_discri_eval: pd.DataFrame,
    df_reg_eval: pd.DataFrame,
    include_regression: bool = True,
) -> pd.DataFrame:
    """
    Given a filtered DataFrame for one eval split, pivot it so that each
    row is a model (in MODEL_ORDER), each column is a task (in TASK_ORDER),
    and missing values are kept as empty strings.

    Args:
        df_discri_eval: DataFrame with classification metrics
        df_reg_eval: DataFrame with regression metrics
        include_regression: If True, include regression tasks. If False, only classification tasks.
    """
    # keep in df_reg_eval only data_tasks which are for regression
    if not df_reg_eval.empty:
        df_reg_eval = df_reg_eval[df_reg_eval['Data'].isin(REG_TASKS)].reset_index(
            drop=True
        )
        # replace some ML columns names in the regression DataFrame using ML_REGRESSION_TO_CLASSIFICATION
        for ml_col, dl_col in ML_REGRESSION_TO_CLASSIFICATION.items():
            df_reg_eval.loc[df_reg_eval['Model'] == ml_col, 'Model'] = dl_col

    # keep in df_discri_eval only data_tasks which are for discrete metrics
    if not df_discri_eval.empty:
        df_discri_eval = df_discri_eval[~df_discri_eval['Data'].isin(REG_TASKS)]

    # concat based on include_regression flag
    if include_regression:
        if not df_discri_eval.empty and not df_reg_eval.empty:
            df_eval = pd.concat([df_discri_eval, df_reg_eval], ignore_index=True)
        elif not df_discri_eval.empty:
            df_eval = df_discri_eval
        elif not df_reg_eval.empty:
            df_eval = df_reg_eval
        else:
            df_eval = pd.DataFrame()
    else:
        df_eval = df_discri_eval

    # start with the full list of models
    wide = pd.DataFrame({'Model': MODEL_ORDER_CLASSIFICATION})

    # flatten the groups into a single ordered list of datasets
    task_order = [ds for grp in GROUPS.values() for ds in grp]

    # Filter task_order based on include_regression flag
    if not include_regression:
        task_order = [ds for ds in task_order if ds not in REG_TASKS]

    if not df_eval.empty:
        for ds in task_order:
            col_name = DATASET_TO_COLUMN.get(ds, ds)
            subset = (
                df_eval[df_eval['Data'] == ds]
                .set_index('Model')['All']
                .rename(col_name)
            )
            # join will insert NaN for models with no data
            wide = wide.join(subset, on='Model')

    return wide.fillna('')
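
A minimal sketch of combining the two metric frames into one wide table; the model and task keys are assumptions and must exist in the module-level constants (MODEL_ORDER_CLASSIFICATION, GROUPS, DATASET_TO_COLUMN) for the corresponding cells to be filled:

import pandas as pd

from src.run.multi_run.csv_to_latex import build_wide_df

# Hypothetical per-split results read from the benchmark's metric CSVs.
df_discri = pd.DataFrame(
    {'Model': ['Roberta'], 'Data': ['SBSAT_RC'], 'All': ['65.0 ± 2.3']}
)
df_reg = pd.DataFrame(
    {'Model': ['Roberta'], 'Data': ['CopCo_REGR'], 'All': ['1.12 ± 0.05']}
)

# Rows follow MODEL_ORDER_CLASSIFICATION; columns follow the GROUPS task order
# (renamed via DATASET_TO_COLUMN); missing cells are kept as empty strings.
wide = build_wide_df(df_discri, df_reg, include_regression=True)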

compute_aggregated_results(df_discri_eval, df_reg_eval)

Compute two aggregated versions of the results:

1. Normalized scores: Normalize the scores then take mean across metrics and tasks
2. Ranking: For each metric and task, compute ranking of all models, then average rank

Parameters:

    df_discri_eval (DataFrame, required): DataFrame with discrete metrics (accuracy, auroc, etc.)
    df_reg_eval (DataFrame, required): DataFrame with regression metrics (rmse)

Returns:

    tuple[DataFrame, DataFrame]: Tuple of (normalized_agg_df, ranking_agg_df)

Source code in src/run/multi_run/csv_to_latex.py
def compute_aggregated_results(
    df_discri_eval: pd.DataFrame, df_reg_eval: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compute two aggregated versions of the results:
    1. Normalized scores: Normalize the scores then take mean across metrics and tasks
    2. Ranking: For each metric and task, compute ranking of all models, then average rank

    Args:
        df_discri_eval: DataFrame with discrete metrics (accuracy, auroc, etc.)
        df_reg_eval: DataFrame with regression metrics (rmse)

    Returns:
        tuple of (normalized_agg_df, ranking_agg_df)
    """

    # Prepare data for aggregation
    df_reg_eval = df_reg_eval[df_reg_eval['Data'].isin(REG_TASKS)].reset_index(
        drop=True
    )
    df_discri_eval = df_discri_eval[
        ~df_discri_eval['Data'].isin(REG_TASKS)
    ].reset_index(drop=True)

    # Replace some ML column names in the regression DataFrame
    for ml_col, dl_col in ML_REGRESSION_TO_CLASSIFICATION.items():
        df_reg_eval.loc[df_reg_eval['Model'] == ml_col, 'Model'] = dl_col

    # Combine all data
    df_combined = pd.concat([df_discri_eval, df_reg_eval], ignore_index=True)

    # Get all unique model-task combinations
    results_list = []
    rankings_list = []

    # Process each task
    task_order = [ds for grp in GROUPS.values() for ds in grp]

    logger.info(f'Processing {len(task_order)} tasks for aggregation: {task_order}')

    for task in task_order:
        task_data = df_combined[df_combined['Data'] == task].copy()
        if task_data.empty:
            logger.warning(f'No data found for task: {task}')
            continue

        # Get the metric column (should be 'All')
        scores = task_data[['Model', 'All']].copy()
        scores = scores.dropna()

        if scores.empty:
            logger.warning(f'No valid scores found for task: {task}')
            continue

        # Extract numeric values from the string format
        scores['numeric_score'] = scores['All'].apply(extract_numeric_value)
        scores = scores.dropna(subset=['numeric_score'])

        if len(scores) < 2:  # Need at least 2 models for meaningful comparison
            logger.warning(f'Less than 2 valid scores for task: {task}')
            continue

        logger.info(f'Processing task {task} with {len(scores)} models')

        # For RMSE (regression), lower is better, so we invert for normalization
        if task in REG_TASKS:
            # For ranking: rank ascending (lower RMSE = better rank)
            scores['rank'] = scores['numeric_score'].rank(method='min', ascending=True)
            # For normalization: invert RMSE so higher normalized score = better
            max_rmse = scores['numeric_score'].max()
            min_rmse = scores['numeric_score'].min()
            if max_rmse != min_rmse:
                scores['normalized'] = 1 - (scores['numeric_score'] - min_rmse) / (
                    max_rmse - min_rmse
                )
            else:
                scores['normalized'] = 0.5  # All same, assign middle value
        else:
            # For other metrics (accuracy, auroc, etc.), higher is better
            scores['rank'] = scores['numeric_score'].rank(method='min', ascending=False)
            # Normalize to 0-1 range
            max_score = scores['numeric_score'].max()
            min_score = scores['numeric_score'].min()
            if max_score != min_score:
                scores['normalized'] = (scores['numeric_score'] - min_score) / (
                    max_score - min_score
                )
            else:
                scores['normalized'] = 0.5  # All same, assign middle value

        # Add task info
        scores['Task'] = task
        results_list.append(scores[['Model', 'Task', 'normalized']])
        rankings_list.append(scores[['Model', 'Task', 'rank']])

    if not results_list:
        logger.warning('No results to aggregate')
        # Return empty DataFrames if no data
        return pd.DataFrame(), pd.DataFrame()

    # Combine all results
    all_normalized = pd.concat(results_list, ignore_index=True)
    all_rankings = pd.concat(rankings_list, ignore_index=True)

    logger.info(
        f'Aggregating results for {len(all_normalized["Model"].unique())} models across {len(all_normalized["Task"].unique())} tasks'
    )

    # Compute aggregated normalized scores (mean across tasks)
    normalized_agg = all_normalized.groupby('Model')['normalized'].mean().reset_index()
    normalized_agg.columns = ['Model', 'Avg_Normalized_Score']
    normalized_agg = normalized_agg.sort_values('Avg_Normalized_Score', ascending=False)

    # Compute aggregated rankings (mean rank across tasks)
    ranking_agg = all_rankings.groupby('Model')['rank'].mean().reset_index()
    ranking_agg.columns = ['Model', 'Avg_Rank']
    ranking_agg = ranking_agg.sort_values('Avg_Rank', ascending=True)

    logger.info(
        f'Generated aggregated results: {len(normalized_agg)} models in normalized scores, {len(ranking_agg)} models in rankings'
    )

    return normalized_agg, ranking_agg
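
A sketch of the aggregation step and how its two outputs feed generate_aggregated_latex_table (documented further below). The model and task names are assumptions; at least two models per task are needed for the min-max normalization and ranking to be meaningful, and the models must be covered by FEATURE_TYPES and MODEL_TO_COLUMN for the table generation step:

import pandas as pd

from src.run.multi_run.csv_to_latex import (
    compute_aggregated_results,
    generate_aggregated_latex_table,
)

# Hypothetical classification results; the regression frame is left empty but
# keeps the expected columns so the REG_TASKS filtering still works.
df_discri = pd.DataFrame({
    'Model': ['Roberta', 'RandomForestMLArgs'],
    'Data': ['SBSAT_RC', 'SBSAT_RC'],
    'All': ['65.0 ± 2.3', '61.5 ± 1.8'],
})
df_reg = pd.DataFrame(columns=['Model', 'Data', 'All'])

normalized_agg, ranking_agg = compute_aggregated_results(df_discri, df_reg)
# normalized_agg: columns ['Model', 'Avg_Normalized_Score'], higher is better
# ranking_agg:    columns ['Model', 'Avg_Rank'], lower is better

latex_str, table_df = generate_aggregated_latex_table(
    normalized_agg, ranking_agg, eval_type='test'
)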

compute_aggregated_results_across_all_metrics(all_metrics_data)

Compute aggregated results across ALL metrics and tasks.

Parameters:

    all_metrics_data (dict, required): Dict mapping metric name to (df_discri_eval, df_reg_eval) tuples

Returns:

    tuple[DataFrame, DataFrame]: Tuple of (normalized_agg_df, ranking_agg_df)

Source code in src/run/multi_run/csv_to_latex.py
def compute_aggregated_results_across_all_metrics(
    all_metrics_data: dict,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Compute aggregated results across ALL metrics and tasks.

    Args:
        all_metrics_data: Dict mapping metric name to (df_discri_eval, df_reg_eval) tuples

    Returns:
        tuple of (normalized_agg_df, ranking_agg_df)
    """

    results_list = []
    rankings_list = []

    task_order = [ds for grp in GROUPS.values() for ds in grp]

    logger.info(
        f'Computing aggregated results across {len(all_metrics_data)} metrics and {len(task_order)} tasks'
    )

    # Process each metric
    for metric_name, (df_discri_eval, df_reg_eval) in all_metrics_data.items():
        # Prepare data for this metric
        if not df_reg_eval.empty:
            df_reg_eval = df_reg_eval[df_reg_eval['Data'].isin(REG_TASKS)].reset_index(
                drop=True
            )
            # Replace some ML column names in the regression DataFrame
            for ml_col, dl_col in ML_REGRESSION_TO_CLASSIFICATION.items():
                df_reg_eval.loc[df_reg_eval['Model'] == ml_col, 'Model'] = dl_col

        if not df_discri_eval.empty:
            df_discri_eval = df_discri_eval[
                ~df_discri_eval['Data'].isin(REG_TASKS)
            ].reset_index(drop=True)

        # Combine all data for this metric
        if not df_discri_eval.empty and not df_reg_eval.empty:
            df_combined = pd.concat([df_discri_eval, df_reg_eval], ignore_index=True)
        elif not df_discri_eval.empty:
            df_combined = df_discri_eval
        elif not df_reg_eval.empty:
            df_combined = df_reg_eval
        else:
            continue  # Skip if both are empty

        # Process each task for this metric
        for task in task_order:
            task_data = df_combined[df_combined['Data'] == task].copy()
            if task_data.empty:
                continue

            scores = task_data[['Model', 'All']].copy()
            scores = scores.dropna()

            if scores.empty:
                continue

            # Extract numeric values
            scores['numeric_score'] = scores['All'].apply(extract_numeric_value)
            scores = scores.dropna(subset=['numeric_score'])

            if len(scores) < 2:
                continue

            # Determine if higher is better based on the metric, not the task type
            higher_is_better = is_metric_higher_better(metric_name)

            if higher_is_better:
                # For metrics where higher is better (e.g., AUROC, R2)
                scores['rank'] = scores['numeric_score'].rank(
                    method='min', ascending=False
                )
                max_score = scores['numeric_score'].max()
                min_score = scores['numeric_score'].min()
                if max_score != min_score:
                    scores['normalized'] = (scores['numeric_score'] - min_score) / (
                        max_score - min_score
                    )
                else:
                    scores['normalized'] = 0.5
            else:
                # For metrics where lower is better (e.g., RMSE, MAE)
                scores['rank'] = scores['numeric_score'].rank(
                    method='min', ascending=True
                )
                max_val = scores['numeric_score'].max()
                min_val = scores['numeric_score'].min()
                if max_val != min_val:
                    scores['normalized'] = 1 - (scores['numeric_score'] - min_val) / (
                        max_val - min_val
                    )
                else:
                    scores['normalized'] = 0.5

            # Add task and metric info
            scores['Task'] = task
            scores['Metric'] = metric_name
            results_list.append(scores[['Model', 'Task', 'Metric', 'normalized']])
            rankings_list.append(scores[['Model', 'Task', 'Metric', 'rank']])

    if not results_list:
        logger.warning('No results to aggregate across metrics')
        return pd.DataFrame(), pd.DataFrame()

    # Combine all results across all metrics and tasks
    all_normalized = pd.concat(results_list, ignore_index=True)
    all_rankings = pd.concat(rankings_list, ignore_index=True)

    logger.info(
        f'Aggregating results for {len(all_normalized["Model"].unique())} models across {len(all_normalized["Task"].unique())} tasks and {len(all_normalized["Metric"].unique())} metrics'
    )

    # Compute aggregated normalized scores (mean across tasks and metrics)
    normalized_agg = all_normalized.groupby('Model')['normalized'].mean().reset_index()
    normalized_agg.columns = ['Model', 'Avg_Normalized_Score']
    normalized_agg = normalized_agg.sort_values('Avg_Normalized_Score', ascending=False)

    # Compute aggregated rankings (mean rank across tasks and metrics)
    ranking_agg = all_rankings.groupby('Model')['rank'].mean().reset_index()
    ranking_agg.columns = ['Model', 'Avg_Rank']
    ranking_agg = ranking_agg.sort_values('Avg_Rank', ascending=True)

    logger.info(
        f'Generated aggregated results: {len(normalized_agg)} models in normalized scores, {len(ranking_agg)} models in rankings'
    )

    return normalized_agg, ranking_agg
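
The cross-metric variant consumes the same per-metric dict shape used by build_task_wide_df; a brief sketch with assumed metric, model, and task names (the direction of each metric is resolved via is_metric_higher_better):

import pandas as pd

from src.run.multi_run.csv_to_latex import compute_aggregated_results_across_all_metrics

# Hypothetical data: two models scored on one classification task under two metrics.
df_auroc = pd.DataFrame({'Model': ['Roberta', 'RandomForestMLArgs'],
                         'Data': ['SBSAT_RC', 'SBSAT_RC'],
                         'All': ['65.0 ± 2.3', '61.5 ± 1.8']})
df_acc = pd.DataFrame({'Model': ['Roberta', 'RandomForestMLArgs'],
                       'Data': ['SBSAT_RC', 'SBSAT_RC'],
                       'All': ['60.0 ± 1.1', '58.2 ± 1.4']})
empty = pd.DataFrame(columns=['Model', 'Data', 'All'])

all_metrics = {'auroc': (df_auroc, empty), 'accuracy': (df_acc, empty)}
normalized_agg, ranking_agg = compute_aggregated_results_across_all_metrics(all_metrics)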

extract_numeric_value(val_str)

Extract numeric value from string format like '65.0 ± 0.0'

Source code in src/run/multi_run/csv_to_latex.py
def extract_numeric_value(val_str):
    """Extract numeric value from string format like '65.0 ± 0.0'"""
    if pd.isna(val_str) or val_str == '' or val_str == '-':
        return np.nan
    val_str = str(val_str)
    try:
        return float(val_str.split(' ±')[0])
    except (ValueError, IndexError):
        try:
            return float(val_str)
        except ValueError:
            return np.nan
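
Behavior on the three input shapes it handles, as a quick sketch:

from src.run.multi_run.csv_to_latex import extract_numeric_value

extract_numeric_value('65.0 ± 2.3')  # 65.0 (text after ' ±' is dropped)
extract_numeric_value('0.87')        # 0.87 (plain numbers are parsed directly)
extract_numeric_value('-')           # nan (placeholders and empty strings map to NaN)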

find_best_indices(numeric_values, higher_is_better)

Find all indices with the best value (handles ties).

Parameters:

    numeric_values (Series, required): Series of numeric values
    higher_is_better (bool, required): If True, find max values; if False, find min values

Returns:

    list: List of indices with the best value

Source code in src/run/multi_run/csv_to_latex.py
def find_best_indices(numeric_values: pd.Series, higher_is_better: bool) -> list:
    """
    Find all indices with the best value (handles ties).

    Args:
        numeric_values: Series of numeric values
        higher_is_better: If True, find max values; if False, find min values

    Returns:
        List of indices with the best value
    """
    if numeric_values.notna().sum() == 0:
        return []

    if higher_is_better:
        best_value = numeric_values.max()
    else:
        best_value = numeric_values.min()

    # Find all indices with the best value (handles ties)
    best_indices = numeric_values[numeric_values == best_value].index.tolist()
    return best_indices
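
A quick illustration of the tie handling:

import pandas as pd

from src.run.multi_run.csv_to_latex import find_best_indices

scores = pd.Series([0.71, 0.83, 0.83, float('nan')])
find_best_indices(scores, higher_is_better=True)   # [1, 2] (both tied maxima are kept)
find_best_indices(scores, higher_is_better=False)  # [0] (the smallest value wins)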

format_value_with_subscript(value)

Format a value string to use LaTeX subscript for standard deviation.

Converts '65.0 ± 2.3' to '65.0\textsubscript{±2.3}'

Parameters:

    value (str, required): String in format 'mean ± std' or just a value

Returns:

    str: Formatted string with std as subscript

Source code in src/run/multi_run/csv_to_latex.py
def format_value_with_subscript(value: str) -> str:
    """
    Format a value string to use LaTeX subscript for standard deviation.

    Converts '65.0 ± 2.3' to '65.0\\textsubscript{±2.3}'

    Args:
        value: String in format 'mean ± std' or just a value

    Returns:
        Formatted string with std as subscript
    """
    if not value or value == '' or value == '-':
        return value

    value_str = str(value)
    if ' ± ' in value_str:
        mean, std = value_str.split(' ± ')
        return f'{mean}\\textsubscript{{±{std}}}'
    return value_str
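
Example behavior:

from src.run.multi_run.csv_to_latex import format_value_with_subscript

format_value_with_subscript('65.0 ± 2.3')  # '65.0\textsubscript{±2.3}'
format_value_with_subscript('-')           # '-' (placeholders pass through unchanged)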

generate_aggregated_latex_table(normalized_agg, ranking_agg, eval_type)

Generate a LaTeX table showing both aggregated results with feature types.

Source code in src/run/multi_run/csv_to_latex.py
def generate_aggregated_latex_table(
    normalized_agg: pd.DataFrame, ranking_agg: pd.DataFrame, eval_type: str
) -> tuple[str, pd.DataFrame]:
    """
    Generate a LaTeX table showing both aggregated results with feature types.
    """
    # Define feature types for each model (hardcoded based on the table provided)

    # Merge the two aggregations
    merged = pd.merge(normalized_agg, ranking_agg, on='Model', how='outer')

    # Format model names
    merged['Model_Display'] = merged['Model'].map(MODEL_TO_COLUMN)

    # Round values for display
    merged['Avg_Normalized_Score'] = merged['Avg_Normalized_Score'].round(3)
    merged['Avg_Rank'] = merged['Avg_Rank'].round(2)

    # Sort by the same order as other tables (MODEL_ORDER_CLASSIFICATION)
    # Create a mapping from model name to its position in MODEL_ORDER_CLASSIFICATION
    model_order_map = {model: i for i, model in enumerate(MODEL_ORDER_CLASSIFICATION)}
    merged['order'] = merged['Model'].map(model_order_map)
    merged = merged.sort_values('order')
    merged = merged.drop(columns=['order'])

    # Find best models (handles ties)
    best_norm_score_indices = find_best_indices(
        merged['Avg_Normalized_Score'], higher_is_better=True
    )
    best_rank_indices = find_best_indices(merged['Avg_Rank'], higher_is_better=False)

    # Build DataFrame representing the LaTeX table contents
    feature_cols = [
        'Layout',
        'Saccade/Fixation',
        'Word-Level',
        'Trial-Level',
        'Linguistic',
        'Embeddings',
    ]
    csv_rows: list[dict[str, object]] = []
    for _, row in merged.iterrows():
        model_key = row['Model']
        display_name = (
            row['Model_Display'] if pd.notna(row['Model_Display']) else model_key
        )
        features = FEATURE_TYPES.get(model_key, {})
        csv_row = {'Model': display_name}
        for feature_col in feature_cols:
            csv_row[feature_col] = features.get(feature_col, '-')
        csv_row['Avg Normalized Score'] = row['Avg_Normalized_Score']
        csv_row['Mean Rank'] = row['Avg_Rank']
        csv_rows.append(csv_row)

    csv_table = pd.DataFrame(csv_rows)

    header = dedent("""
    \\begin{table}[ht]
    \\centering
    \\caption{Feature types used by each model, and aggregated model performance across all benchmark tasks and metrics. \\textbf{Layout} stands for information about the position of the text or fixations on the screen. Eye movement features are divided into three levels of granularity: \\textbf{Saccades/Fixations} (e.g., fixation duration), \\textbf{Words} (e.g., total fixation duration on a given word), and \\textbf{Trial} (e.g., average total fixation duration across all the words during the trial). Text features are divided into: \\textbf{Linguistic} word properties (e.g., word frequency) and contextual word \\textbf{Embeddings} (e.g., RoBERTa embeddings). \\textbf{Average Normalized Score} is the mean of min-max normalized scores across all tasks and metrics (higher is better). \\textbf{Mean Rank} is the mean ranking across all tasks and metrics (lower is better). Best performing model for each aggregation metric is shown in \\textbf{bold}.}
    \\resizebox{\\linewidth}{!}{%
    \\begin{tabular}{l|c|ccc|cc||cc}
    \\toprule
    \\textbf{Model} & \\multicolumn{1}{c|}{\\textbf{Layout}} & \\multicolumn{3}{c|}{\\textbf{Eye movement features}} & \\multicolumn{2}{c||}{\\textbf{Text features}}  & \\multicolumn{2}{c}{\\textbf{Aggregated performance}}\\\\
    &  & \\makecell{Saccade/\\\\Fixation Level} & \\makecell{Word\\\\Level} & \\makecell{Trial\\\\Level} & Linguistic & Embeddings & Avg. Normalized Score$\\uparrow$ & Mean Rank$\\downarrow$\\\\
    \\midrule
    """)

    body = ''
    for idx, row in merged.iterrows():
        model_name = (
            row['Model_Display'] if pd.notna(row['Model_Display']) else row['Model']
        )

        # Get feature types for this model
        features = FEATURE_TYPES[row['Model']]

        # Format scores with bold for best
        if pd.notna(row['Avg_Normalized_Score']):
            norm_score = f'{row["Avg_Normalized_Score"]:.3f}'
            if idx in best_norm_score_indices:
                norm_score = f'\\textbf{{{norm_score}}}'
        else:
            norm_score = '--'

        if pd.notna(row['Avg_Rank']):
            avg_rank = f'{row["Avg_Rank"]:.2f}'
            if idx in best_rank_indices:
                avg_rank = f'\\textbf{{{avg_rank}}}'
        else:
            avg_rank = '--'

        # Build row with feature types and aggregated performance
        body += f'{model_name} & {features["Layout"]} & {features["Saccade/Fixation"]} & {features["Word-Level"]} & {features["Trial-Level"]} & {features["Linguistic"]} & {features["Embeddings"]} & {norm_score} & {avg_rank} \\\\\n'

        # Add horizontal lines after certain model groups
        if row['Model'] in ['Roberta', 'RandomForestMLArgs']:
            body += '\\midrule\n'

    footer = dedent(f"""
    \\bottomrule
    \\end{{tabular}}%
    }}
    \\label{{tab:features-per-model-results-{eval_type}}}
    \\end{{table}}
    """)

    return header + body + footer, csv_table.reset_index(drop=True)

generate_breakdown_tables(df, metric, metric_type)

For each task, generate a LaTeX table showing per-model performance across evaluation regimes: unseen subject, unseen item, unseen both, and all.

Source code in src/run/multi_run/csv_to_latex.py
def generate_breakdown_tables(df: pd.DataFrame, metric: str, metric_type: str):
    """
    For each task, generate a LaTeX table showing per-model performance
    across evaluation regimes: unseen subject, unseen item, unseen both, and all.
    """
    eval_cols = [
        'Unseen subject seen item',
        'Seen subject unseen item',
        'Unseen subject unseen item',
        'All',
    ]

    for task_key, task_label in DATASET_TO_COLUMN.items():
        df_task = df[df['Data'] == task_key].copy()

        # Ensure all models from MODEL_ORDER are present
        # Create a dataframe with all models
        all_models_df = pd.DataFrame({'Model': MODEL_ORDER_BY_METRIC_TYPE[metric_type]})

        # Merge with existing data, keeping all models
        if not df_task.empty:
            df_task = df_task[['Model'] + eval_cols]
            df_task = all_models_df.merge(df_task, on='Model', how='left')
        else:
            df_task = all_models_df
            for col in eval_cols:
                df_task[col] = ''

        # Replace NaN with '-'
        df_task[eval_cols] = df_task[eval_cols].fillna('-')

        # Replace empty strings with '-'
        for col in eval_cols:
            df_task[col] = df_task[col].replace('', '-')

        # Save original model names before formatting
        df_task['Model_Original'] = df_task['Model']

        # Format model names
        df_task['Model'] = df_task['Model'].map(MODEL_TO_COLUMN)
        df_task['Model'] = df_task['Model'].fillna(df_task['Model_Original'])

        # Find best model for each evaluation column
        # Determine if higher or lower is better based on metric
        is_higher_better = is_metric_higher_better(metric)

        best_indices = {}
        for col in eval_cols:
            numeric_vals = df_task[col].apply(extract_numeric_value)
            if numeric_vals.notna().any():
                # Find all best indices (handles ties)
                best_idx_list = find_best_indices(numeric_vals, is_higher_better)
                best_indices[col] = best_idx_list
            else:
                best_indices[col] = []

        # Build LaTeX
        header = dedent(
            rf"""
        \begin{{table}}[ht]
        \centering
        \caption{{{METRICS_LABELS[metric]} performance for task: \textbf{{{task_label}}}, broken down by evaluation regime.}}
        \resizebox{{\textwidth}}{{!}}{{%
        \begin{{tabular}}{{l|ccc|c}}
        \toprule
        \textbf{{Model}} & \textbf{{Unseen Reader}} & \textbf{{Unseen Text}} & \textbf{{Unseen Reader \& Text}} & \textbf{{All}} \\
        \midrule
        """
        )

        rows = ''
        for idx, row in df_task.iterrows():
            cells = [str(row['Model'])]

            # Format each evaluation column, bolding if it's the best
            for col in eval_cols:
                val = str(row[col])
                # Format with subscript for std deviation
                val = format_value_with_subscript(val)
                if idx in best_indices[col] and val != '-':
                    val = f'\\textbf{{{val}}}'
                cells.append(val)

            rows += ' & '.join(cells) + ' \\\\\n'
            if row['Model_Original'] in ['Roberta', 'RandomForestMLArgs']:
                rows += '\\midrule\n'

        footer = dedent(f"""
        \\bottomrule
        \\end{{tabular}}%
        }}
        \\label{{tab:task-breakdown-{task_key.lower()}-{metric}}}
        \\end{{table}}
        """)

        # Save to both locations
        latex_content = header + rows + footer
        save_to_both_locations(
            latex_content, f'breakdown/{task_key}_{metric}.tex', is_csv=False
        )
        csv_output = df_task.drop(columns=['Model_Original'], errors='ignore')
        save_to_both_locations(
            csv_output, f'breakdown/{task_key}_{metric}.csv', is_csv=True
        )

generate_combined_table(df_auroc_test, df_rmse_test)

Generate a combined LaTeX table showing AUROC for classification tasks and RMSE for regression tasks in a single table.

The table groups tasks by type:

- Classification tasks (Reading Comprehension, Domain Expertise, Claim Verification, Dyslexia Detection)
- Regression tasks (Subj. Text Difficulty, Reading Compr. Skill, Vocab. Knowledge)

Parameters:

    df_auroc_test (DataFrame, required): DataFrame with AUROC results for test split
    df_rmse_test (DataFrame, required): DataFrame with RMSE results for test split

Returns:

    tuple[str, DataFrame]: Tuple containing the LaTeX table string and a DataFrame representation

Source code in src/run/multi_run/csv_to_latex.py
def generate_combined_table(
    df_auroc_test: pd.DataFrame,
    df_rmse_test: pd.DataFrame,
) -> tuple[str, pd.DataFrame]:
    """
    Generate a combined LaTeX table showing AUROC for classification tasks
    and RMSE for regression tasks in a single table.

    The table groups tasks by type:
    - Classification tasks (Reading Comprehension, Domain Expertise, Claim Verification, Dyslexia Detection)
    - Regression tasks (Subj. Text Difficulty, Reading Compr. Skill, Vocab. Knowledge)

    Args:
        df_auroc_test: DataFrame with AUROC results for test split
        df_rmse_test: DataFrame with RMSE results for test split

    Returns:
        Tuple containing the LaTeX table string and a DataFrame representation
    """

    all_tasks = classification_tasks + regression_tasks

    # Prepare RMSE data: replace ML model names with DL equivalents
    df_rmse_processed = df_rmse_test.copy()
    for ml_col, dl_col in ML_REGRESSION_TO_CLASSIFICATION.items():
        df_rmse_processed.loc[df_rmse_processed['Model'] == ml_col, 'Model'] = dl_col

    # Build wide dataframe with all models
    wide = pd.DataFrame({'Model': MODEL_ORDER_CLASSIFICATION})

    # Add AUROC values for classification tasks
    for task in classification_tasks:
        task_data = df_auroc_test[df_auroc_test['Data'] == task]
        if not task_data.empty:
            subset = task_data.set_index('Model')['All'].rename(task)
            wide = wide.join(subset, on='Model')

    # Add RMSE values for regression tasks
    for task in regression_tasks:
        task_data = df_rmse_processed[df_rmse_processed['Data'] == task]
        if not task_data.empty:
            subset = task_data.set_index('Model')['All'].rename(task)
            wide = wide.join(subset, on='Model')

    # Replace NaN with empty string
    wide = wide.fillna('')

    # Find best model for each task
    best_by_task = {}
    for task in all_tasks:
        if task not in wide.columns:
            continue

        numeric_values = wide[task].apply(extract_numeric_value)

        # For regression tasks, use RMSE (lower is better); for classification, use AUROC (higher is better)
        if task in regression_tasks:
            higher_is_better = is_metric_higher_better('rmse')
        else:
            higher_is_better = is_metric_higher_better('auroc')

        # Find all best indices (handles ties)
        best_indices = find_best_indices(numeric_values, higher_is_better)
        if best_indices:
            best_by_task[task] = [wide.loc[idx, 'Model'] for idx in best_indices]

    # Build table header
    num_classification = len(classification_tasks)
    num_regression = len(regression_tasks)

    col_fmt = 'l|' + 'c' * num_classification + '|' + 'c' * num_regression

    # Build second header row with task types
    header_row2 = ' '

    # Mapping of special commands per task
    special_cmds_before = {
        'OneStop_RC': r'',
        'SBSAT_RC': r'\newsetup{}',
        'PoTeC_RC': r'\newthing{}',
        'PoTeC_DE': r'\newthing{}',
        'IITBHGC_CV': r'\newthing{}',
        'CopCo_TYP': r'',
        'SB-SAT_REGR': r'',
        'CopCo_REGR': r'',
        'MECO_L2_REGR': r'\newthing{}',
    }
    special_cmds_after = {
        'OneStop_RC': r'',
        'SBSAT_RC': r'',
        'PoTeC_RC': r'',
        'PoTeC_DE': r'\woman{}+\page{}',
        'IITBHGC_CV': r'\woman{}+\page{}',
        'CopCo_TYP': r'\woman{}',
        'SB-SAT_REGR': r'\woman{}+\page{}',
        'CopCo_REGR': r'\woman{}',
        'MECO_L2_REGR': r'\woman{}',
    }

    # Reading Comprehension tasks (first 3 classification tasks)
    reading_compr_tasks = ['OneStop_RC', 'SBSAT_RC', 'PoTeC_RC']
    num_reading_compr = len(
        [t for t in reading_compr_tasks if t in classification_tasks]
    )
    if num_reading_compr > 0:
        header_row2 += f' & \\multicolumn{{{num_reading_compr}}}{{c}}{{Reading Comprehension\\woman{{}}+\\page{{}}}}'

    # Other classification tasks
    other_class_tasks = ['PoTeC_DE', 'IITBHGC_CV', 'CopCo_TYP']
    for task in other_class_tasks:
        if task in classification_tasks:
            label, _ = task_headers.get(task, ('Unknown', 'Unknown'))
            cmd = special_cmds_before.get(task, '')
            cmd2 = special_cmds_after.get(task, '')
            if cmd:
                label = f'{cmd} {label} {cmd2}'
            header_row2 += f' & \\makecell{{{label}}}'

    # Regression tasks
    for task in regression_tasks:
        label, _ = task_headers.get(task, ('Unknown', 'Unknown'))
        cmd = special_cmds_before.get(task, '')
        cmd2 = special_cmds_after.get(task, '')
        if cmd:
            label = f'{cmd} {label} {cmd2}'
        header_row2 += f' & \\makecell{{{label}}}'

    header_row2 += '\\\\\n'

    # Build third header row with dataset names
    header_row3 = ' '
    for task in classification_tasks + regression_tasks:
        _, dataset = task_headers.get(task, ('Unknown', 'Unknown'))
        header_row3 += f' & {dataset}'
    header_row3 += '\\\\\n'

    header = dedent(f"""
    \\begin{{table}}[ht]
    \\centering
    \\caption{{Model performance across the benchmark tasks and datasets on test data. \\textbf{{AUROC}} (higher is better) for classification tasks and \\textbf{{RMSE}} (lower is better) for regression tasks. The best performing model per task and dataset is shown in \\textbf{{bold}}. Reported values indicate mean~$\\pm$~standard error across folds. The tasks belong to two categories, where \\woman{{}} indicates a reader characteristic prediction task and \\woman{{}}+\\page{{}} an interaction of a reader with a text. \\newthing{{}} indicates new tasks and task-dataset combinations introduced in \\benchmarkname. \\newsetup{{}} indicates a new experimental setup for the task-dataset combination.}}
    \\resizebox{{\\textwidth}}{{!}}{{%
    \\begin{{tabular}}{{@{{}}{col_fmt}@{{}}}}
    \\toprule
     & \\multicolumn{{{num_classification}}}{{c|}}{{\\textbf{{Classification (AUROC$\\uparrow$)}}}} & \\multicolumn{{{num_regression}}}{{c}}{{\\textbf{{Regression (RMSE$\\downarrow$)}}}} \\\\
    \\addlinespace{header_row2}{header_row3}\\midrule
    \\addlinespace
    """)

    # Build table body
    body = ''
    models_in_table: list[str] = []
    for _, row in wide.iterrows():
        model_name = str(row['Model'])

        # Skip models with no data - check if all task values are empty
        has_data = False
        for task in all_tasks:
            if task in wide.columns:
                val_str = str(row[task])
                if val_str and val_str != '' and val_str != 'nan':
                    has_data = True
                    break

        if not has_data:
            continue

        models_in_table.append(model_name)

        # First column: model name
        model_label = MODEL_TO_COLUMN.get(model_name, model_name)
        if model_label is None:
            model_label = model_name
        cells: list[str] = [model_label]

        # Add values for each task (only tasks with data)
        for task in classification_tasks + regression_tasks:
            if task not in wide.columns:
                cells.append('-')
            else:
                value = str(row[task])
                if value == '' or value == 'nan':
                    cells.append('-')
                else:
                    # Format with subscript for std deviation
                    if ' ± ' in value:
                        mean, std = value.split(' ± ')
                        formatted_value = f'{mean}\\textsubscript{{±{std}}}'
                    else:
                        formatted_value = value

                    # Bold if one of the best models for this task
                    if model_name in best_by_task.get(task, []):
                        formatted_value = f'\\textbf{{{formatted_value}}}'

                    cells.append(formatted_value)

        body += ' & '.join(cells) + '\\\\\n'

        # Add horizontal lines after certain model groups
        if model_name in ['Roberta', 'RandomForestMLArgs']:
            body += '\\addlinespace[1ex]\n\\hline\n\\addlinespace[1ex]\n'

    # Build table footer
    footer = dedent("""
    \\bottomrule
    \\end{tabular}%
    }
    \\small
    \\label{tab:task-results-combined}
    \\end{table}
    """)

    latex_table = header + body + footer

    wide_filtered = wide[wide['Model'].isin(models_in_table)].reset_index(drop=True)
    csv_df = prepare_dataframe_for_csv(wide_filtered)

    return latex_table, csv_df
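
A usage sketch; the model and task keys are assumptions and must appear in MODEL_ORDER_CLASSIFICATION and in the module's classification_tasks / regression_tasks lists for the corresponding columns to be populated:

import pandas as pd

from src.run.multi_run.csv_to_latex import generate_combined_table

# Hypothetical test-split AUROC and RMSE results.
df_auroc_test = pd.DataFrame(
    {'Model': ['Roberta'], 'Data': ['SBSAT_RC'], 'All': ['65.0 ± 2.3']}
)
df_rmse_test = pd.DataFrame(
    {'Model': ['Roberta'], 'Data': ['CopCo_REGR'], 'All': ['1.12 ± 0.05']}
)

latex_table, csv_df = generate_combined_table(df_auroc_test, df_rmse_test)
with open('combined_results.tex', 'w') as f:
    f.write(latex_table)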

generate_latex_table(wide, eval_type, discrete_metric, reg_metric='Not regression', include_regression=True)

Produce a complete LaTeX table string for a given eval split ('test' or 'val') using the wide DataFrame.

Parameters:

    wide (DataFrame, required): Wide DataFrame with model results
    eval_type (str, required): 'test' or 'val'
    discrete_metric (str, required): Name of the discrete metric (accuracy, auroc, etc.)
    reg_metric (str, default 'Not regression'): Label for regression metric
    include_regression (bool, default True): If True, include regression tasks. If False, only classification.

Source code in src/run/multi_run/csv_to_latex.py
def generate_latex_table(
    wide: pd.DataFrame,
    eval_type: str,
    discrete_metric: str,
    reg_metric: str = 'Not regression',
    include_regression: bool = True,
) -> str:
    """
    Produce a complete LaTeX table string for a given eval split
    ('test' or 'val') using the wide DataFrame.

    Args:
        wide: Wide DataFrame with model results
        eval_type: 'test' or 'val'
        discrete_metric: Name of the discrete metric (accuracy, auroc, etc.)
        reg_metric: Label for regression metric
        include_regression: If True, include regression tasks. If False, only classification.
    """

    # Get actual tasks that have data from the wide dataframe columns
    # (excluding the 'Model' column)
    available_columns = [col for col in wide.columns if col != 'Model']

    # Only include tasks that are actually in the dataframe
    task_order = []
    for ds in [ds for grp in GROUPS.values() for ds in grp]:
        col_name = DATASET_TO_COLUMN.get(ds, ds)
        if col_name in available_columns:
            # Check if column has any non-empty data
            has_data = (wide[col_name] != '').any()
            if has_data:
                task_order.append(ds)

    if not task_order:
        # No tasks to display
        return ''

    # build column-format, inserting '|' before each group block
    special_split_point = 'CopCo_TYP'
    col_fmt = ['l']
    col_fmt.append('|')  # vertical line between groups

    for grp_name, grp_tasks in GROUPS.items():
        # Only include tasks from this group that are in task_order
        tasks_in_group = [t for t in grp_tasks if t in task_order]

        for i, task in enumerate(tasks_in_group):
            if task == special_split_point:
                col_fmt.append(':')  # placeholder for dashed line logic
            col_fmt.append('c')

        if tasks_in_group:  # Only add separator if group has tasks
            col_fmt.append('|')  # vertical line between groups

    col_fmt = ''.join(col_fmt)

    # Determine metric direction for caption
    metric_name = METRICS_LABELS.get(discrete_metric, discrete_metric)

    # Determine if this is a regression-only or classification-only table
    has_classification_tasks = any(ds not in REG_TASKS for ds in task_order)
    has_regression_tasks = any(ds in REG_TASKS for ds in task_order)

    if has_regression_tasks and not has_classification_tasks:
        # Regression-only table
        metric_direction = f'Lower {reg_metric} values indicate better performance'
        task_desc = f'\\textbf{{{reg_metric}}} values are presented for all tasks'
    elif has_classification_tasks and not has_regression_tasks:
        # Classification-only table
        metric_direction = f'Higher {metric_name} values indicate better performance'
        task_desc = f'\\textbf{{{metric_name}}} values are presented for all tasks'

    # header: caption, resizebox, begin tabular
    hdr = dedent(f"""
    \\begin{{table}}[ht]
    \\centering
    \\caption{{Model performance across benchmark tasks grouped into Reader and Reader \\& Text categories. {task_desc}, averaged across folds. {metric_direction}. Best performing model per task is shown in \\textbf{{bold}}.}}
    \\resizebox{{\\textwidth}}{{!}}{{%
    \\begin{{tabular}}{{@{{}}{col_fmt}@{{}}}}
    \\toprule
    \\multirow{{2}}{{*}}{{\\textbf{{Method}}}}""")

    # multicolumn headers
    for grp_name, ds_list in GROUPS.items():
        # Filter to only tasks in this group that are in task_order
        tasks_in_group = [t for t in ds_list if t in task_order]
        if tasks_in_group:
            hdr += f' & \\multicolumn{{{len(tasks_in_group)}}}{{c|}}{{\\textbf{{{grp_name}}}}}'
    hdr += ' \\\\\n'

    # cmidrules
    cmid = []
    offset = 2
    for ds_list in GROUPS.values():
        # Filter to only tasks in this group that are in task_order
        tasks_in_group = [t for t in ds_list if t in task_order]
        if tasks_in_group:
            end = offset + len(tasks_in_group) - 1
            cmid.append(f'\\cmidrule(lr){{{offset}-{end}}}')
            offset = end + 1
    hdr += ' '.join(cmid) + '\n'

    # second header row: task acronyms
    hdr += ' & ' + ' & '.join(
        f'\\textbf{{{DATASET_TO_COLUMN[ds]}}}' for ds in task_order
    )
    hdr += ' \\\\\n\\midrule\n'
    formatted_hline = '\\addlinespace[1ex]\n\\hline\n\\addlinespace[1ex]\n'
    hdr += formatted_hline

    # Find best model for each task
    best_by_task = {}
    for ds in task_order:
        col_name = DATASET_TO_COLUMN[ds]
        if col_name not in wide.columns:
            continue

        # Extract numeric values for comparison
        numeric_values = wide[col_name].apply(extract_numeric_value)

        # Determine metric for this task
        if ds in REG_TASKS:
            # For regression tasks, use reg_metric to determine direction
            higher_is_better = is_metric_higher_better(reg_metric)
        else:
            # For classification tasks, use discrete_metric to determine direction
            higher_is_better = is_metric_higher_better(discrete_metric)

        # Find all best indices (handles ties)
        best_indices = find_best_indices(numeric_values, higher_is_better)
        if best_indices:
            best_by_task[col_name] = [wide.loc[idx, 'Model'] for idx in best_indices]

    # body rows
    body = ''
    for _, row in wide.iterrows():
        first_cell = MODEL_TO_COLUMN[row['Model']]
        cells = [first_cell]

        for ds in task_order:
            col_name = DATASET_TO_COLUMN[ds]
            value = str(row[col_name])

            # Format with subscript for std deviation
            value = format_value_with_subscript(value)

            # Bold if this is one of the best models for this task
            if (
                row['Model'] in best_by_task.get(col_name, [])
                and value != ''
                and value != '-'
            ):
                value = f'\\textbf{{{value}}}'

            cells.append(value)

        body += ' & '.join(cells)
        body += ' \\\\\n'
        if row['Model'] in ['Roberta', 'RandomForestMLArgs']:
            body += formatted_hline

    # table tail
    tail = dedent(rf"""
    \bottomrule
    \end{{tabular}}%
    }}
    \small
    \label{{tab:task-results-{eval_type}-{discrete_metric}}}
    \end{{table}}
    """)

    return hdr + body + tail
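
An end-to-end sketch that feeds a wide DataFrame from build_wide_df into generate_latex_table; as above, the metric, model, and task names are assumptions, and the models and tasks must be covered by MODEL_TO_COLUMN, GROUPS, and DATASET_TO_COLUMN:

import pandas as pd

from src.run.multi_run.csv_to_latex import build_wide_df, generate_latex_table

# Hypothetical AUROC results for a single classification task.
df_discri = pd.DataFrame(
    {'Model': ['Roberta'], 'Data': ['SBSAT_RC'], 'All': ['65.0 ± 2.3']}
)
df_reg = pd.DataFrame(columns=['Model', 'Data', 'All'])

wide = build_wide_df(df_discri, df_reg, include_regression=False)
latex = generate_latex_table(
    wide, eval_type='test', discrete_metric='auroc', include_regression=False
)
with open('task_results_test_auroc.tex', 'w') as f:
    f.write(latex)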

generate_latex_table_per_task(wide, task)

Produce a complete LaTeX table string for a specific task with metrics as columns, showing both validation and test results side-by-side.

Parameters:

    wide (DataFrame, required): Wide DataFrame with model results (rows=models, cols=metrics with _Val and _Test suffixes)
    task (str, required): The data task key (e.g., 'CopCo_RCS')

Source code in src/run/multi_run/csv_to_latex.py
def generate_latex_table_per_task(
    wide: pd.DataFrame,
    task: str,
) -> str:
    """
    Produce a complete LaTeX table string for a specific task with metrics as columns,
    showing both validation and test results side-by-side.

    Args:
        wide: Wide DataFrame with model results (rows=models, cols=metrics with _Val and _Test suffixes)
        task: The data task key (e.g., 'CopCo_RCS')
    """

    if wide.empty:
        return ''

    # Get metric columns (all columns except 'Model')
    all_cols = [col for col in wide.columns if col != 'Model']

    if not all_cols:
        return ''

    # Group columns by metric (removing _Val and _Test suffixes)
    metric_names = []
    for col in all_cols:
        if col.endswith('_Val'):
            metric_name = col[:-4]
            if metric_name not in metric_names:
                metric_names.append(metric_name)
        elif col.endswith('_Test'):
            metric_name = col[:-5]
            if metric_name not in metric_names:
                metric_names.append(metric_name)

    # Build column format: one column for model name, then 2 columns per metric (val, test)
    num_metric_cols = sum(
        1 for m in metric_names if f'{m}_Val' in all_cols or f'{m}_Test' in all_cols
    )
    col_fmt = 'l|' + 'cc|' * num_metric_cols

    # Get task display name
    task_label = DATASET_TO_COLUMN.get(task, task)

    # Header: caption, resizebox, begin tabular
    hdr = dedent(f"""
    \\begin{{table}}[ht]
    \\centering
    \\caption{{Model performance on \\textbf{{{task_label}}} task across different metrics for validation and test sets, averaged across folds. Best performing model per metric and split is shown in \\textbf{{bold}}.}}
    \\resizebox{{\\textwidth}}{{!}}{{%
    \\begin{{tabular}}{{@{{}}{col_fmt}@{{}}}}
    \\toprule
    \\multirow{{2}}{{*}}{{\\textbf{{Method}}}}""")

    # Add metric multicolumn headers
    for metric_name in metric_names:
        has_val = f'{metric_name}_Val' in all_cols
        has_test = f'{metric_name}_Test' in all_cols
        if has_val or has_test:
            hdr += f' & \\multicolumn{{2}}{{c|}}{{\\textbf{{{metric_name}}}}}'
    hdr += ' \\\\\n'

    # Add cmidrules for each metric group
    cmid = []
    offset = 2
    for metric_name in metric_names:
        has_val = f'{metric_name}_Val' in all_cols
        has_test = f'{metric_name}_Test' in all_cols
        if has_val or has_test:
            end = offset + 1
            cmid.append(f'\\cmidrule(lr){{{offset}-{end}}}')
            offset = end + 1
    hdr += ' '.join(cmid) + '\n'

    # Second header row: Val/Test labels
    hdr += ' '
    for metric_name in metric_names:
        has_val = f'{metric_name}_Val' in all_cols
        has_test = f'{metric_name}_Test' in all_cols
        if has_val or has_test:
            hdr += ' & \\textbf{Val} & \\textbf{Test}'
    hdr += ' \\\\\n\\midrule\n'

    # Find best model for each metric column
    best_by_col = {}
    for col in all_cols:
        # Extract numeric values for comparison
        numeric_values = wide[col].apply(extract_numeric_value)

        # Extract metric name from column (format: {Metric}_Val or {Metric}_Test)
        if col.endswith('_Val') or col.endswith('_Test'):
            metric_name = col.rsplit('_', 1)[0]
        else:
            metric_name = col

        # Determine if higher is better for this metric
        higher_is_better = is_metric_higher_better(metric_name)

        # Find all best indices (handles ties)
        best_indices = find_best_indices(numeric_values, higher_is_better)
        if best_indices:
            best_by_col[col] = [wide.loc[idx, 'Model'] for idx in best_indices]

    # Body rows
    body = ''
    for _, row in wide.iterrows():
        first_cell = MODEL_TO_COLUMN[row['Model']]
        cells = [first_cell]

        for metric_name in metric_names:
            val_col = f'{metric_name}_Val'
            test_col = f'{metric_name}_Test'

            # Add validation value
            if val_col in all_cols:
                val_value = str(row[val_col])
                # Format with subscript for std deviation
                val_value = format_value_with_subscript(val_value)
                if (
                    row['Model'] in best_by_col.get(val_col, [])
                    and val_value != ''
                    and val_value != '-'
                ):
                    val_value = f'\\textbf{{{val_value}}}'
                cells.append(val_value)
            else:
                cells.append('')

            # Add test value
            if test_col in all_cols:
                test_value = str(row[test_col])
                # Format with subscript for std deviation
                test_value = format_value_with_subscript(test_value)
                if (
                    row['Model'] in best_by_col.get(test_col, [])
                    and test_value != ''
                    and test_value != '-'
                ):
                    test_value = f'\\textbf{{{test_value}}}'
                cells.append(test_value)
            else:
                cells.append('')

        body += ' & '.join(cells)
        body += ' \\\\\n'
        if row['Model'] in ['Roberta', 'RandomForestMLArgs']:
            body += '\\midrule\n'

    # Table tail
    tail = dedent(rf"""
    \bottomrule
    \end{{tabular}}%
    }}
    \label{{tab:task-{task.lower()}}}
    \end{{table}}
    """)

    return hdr + body + tail
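
For illustration, a minimal usage sketch (not taken from the source). The import path is inferred from the file location above, the model names are assumed to be valid keys of the module-level MODEL_TO_COLUMN mapping, and the value strings are placeholders in whatever mean/std format format_value_with_subscript expects upstream.

import pandas as pd

# Import path assumed from 'src/run/multi_run/csv_to_latex.py'.
from src.run.multi_run.csv_to_latex import generate_latex_table_per_task

# Hypothetical wide frame: one metric with paired '_Val'/'_Test' columns and
# placeholder value strings; 'Roberta' and 'RandomForestMLArgs' also appear in
# the function's midrule check, so they are assumed to be valid model names.
wide = pd.DataFrame(
    {
        'Model': ['Roberta', 'RandomForestMLArgs'],
        'AUROC_Val': ['0.71 (0.02)', '0.65 (0.03)'],
        'AUROC_Test': ['0.69 (0.02)', '0.63 (0.04)'],
    }
)

latex = generate_latex_table_per_task(wide, task='CopCo_RCS')
print(latex)  # full table string, with the best cell per column wrapped in \textbf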

generate_latex_table_per_task_by_regime(wide, task, eval_type)

Produce a complete LaTeX table string for a specific task, broken down by regime, with all metrics shown for each regime.

Parameters:

- wide (DataFrame, required): Wide DataFrame with model results (rows=models, cols=(regime, metric) combinations)
- task (str, required): The data task key (e.g., 'CopCo_RCS')
- eval_type (str, required): 'val' or 'test'

Source code in src/run/multi_run/csv_to_latex.py (lines 670-820)
def generate_latex_table_per_task_by_regime(
    wide: pd.DataFrame,
    task: str,
    eval_type: str,
) -> str:
    """
    Produce a complete LaTeX table string for a specific task, broken down by regime,
    with all metrics shown for each regime.

    Args:
        wide: Wide DataFrame with model results (rows=models, cols=(regime, metric) combinations)
        task: The data task key (e.g., 'CopCo_RCS')
        eval_type: 'val' or 'test'
    """

    if wide.empty:
        return ''

    # Get all columns except 'Model'
    all_cols = [col for col in wide.columns if col != 'Model']

    if not all_cols:
        return ''

    # Organize columns by regime and metric
    # Column format is: {Regime}_{Metric}
    regimes = ['Unseen Reader', 'Unseen Text', 'Unseen Text \\& Reader', 'Average']

    # Extract unique metrics from columns
    metrics = []
    for col in all_cols:
        for regime in regimes:
            if col.startswith(f'{regime}_'):
                metric = col[len(regime) + 1 :]
                if metric not in metrics:
                    metrics.append(metric)

    # Build column format: one column for model name, then columns for each regime group
    # Each regime has one column per metric
    col_fmt_parts = ['l|']
    for regime in regimes:
        regime_metrics = [m for m in metrics if f'{regime}_{m}' in all_cols]
        if regime_metrics:
            col_fmt_parts.append('c' * len(regime_metrics))
            col_fmt_parts.append('|')
    col_fmt = ''.join(col_fmt_parts)

    # Get task display name
    task_label = DATASET_TO_COLUMN.get(task, task)

    # Determine eval type label
    eval_label = 'validation' if eval_type == 'val' else 'test'

    # Header: caption, resizebox, begin tabular
    hdr = dedent(f"""
    \\begin{{table}}[ht]
    \\centering
    \\caption{{Model performance on the {task_label} for the \\textbf{{{eval_label}}} set.}}
    \\resizebox{{\\textwidth}}{{!}}{{%
    \\begin{{tabular}}{{@{{}}{col_fmt}@{{}}}}
    \\toprule
    \\multirow{{2}}{{*}}{{\\textbf{{Method}}}}""")

    # Add regime multicolumn headers
    for regime in regimes:
        regime_metrics = [m for m in metrics if f'{regime}_{m}' in all_cols]
        if regime_metrics:
            hdr += f' & \\multicolumn{{{len(regime_metrics)}}}{{c|}}{{\\textbf{{{regime}}}}}'
    hdr += ' \\\\\n'

    # Add cmidrules for each regime group
    cmid = []
    offset = 2
    for regime in regimes:
        regime_metrics = [m for m in metrics if f'{regime}_{m}' in all_cols]
        if regime_metrics:
            end = offset + len(regime_metrics) - 1
            cmid.append(f'\\cmidrule(lr){{{offset}-{end}}}')
            offset = end + 1
    hdr += ' '.join(cmid) + '\n'

    # Second header row: metric labels
    hdr += ' '
    for regime in regimes:
        regime_metrics = [m for m in metrics if f'{regime}_{m}' in all_cols]
        for metric in regime_metrics:
            hdr += f' & \\textbf{{{metric}}}'
    hdr += ' \\\\\n\\midrule\n'

    # Find best model for each (regime, metric) column
    best_by_col = {}
    for col in all_cols:
        # Extract numeric values for comparison
        numeric_values = wide[col].apply(extract_numeric_value)

        # Extract metric name from column (format: {Regime}_{Metric})
        for regime in regimes:
            if col.startswith(f'{regime}_'):
                metric_name = col[len(regime) + 1 :]
                break
        else:
            metric_name = col

        # Determine if higher is better for this metric
        higher_is_better = is_metric_higher_better(metric_name)

        # Find all best indices (handles ties)
        best_indices = find_best_indices(numeric_values, higher_is_better)
        if best_indices:
            best_by_col[col] = [wide.loc[idx, 'Model'] for idx in best_indices]

    # Body rows
    body = ''
    for _, row in wide.iterrows():
        first_cell = MODEL_TO_COLUMN[row['Model']]
        cells = [first_cell]

        for regime in regimes:
            regime_metrics = [m for m in metrics if f'{regime}_{m}' in all_cols]
            for metric in regime_metrics:
                col = f'{regime}_{metric}'
                value = str(row[col])

                # Format with subscript for std deviation
                value = format_value_with_subscript(value)

                # Bold if this is one of the best models for this regime+metric
                if (
                    row['Model'] in best_by_col.get(col, [])
                    and value != ''
                    and value != '-'
                ):
                    value = f'\\textbf{{{value}}}'

                cells.append(value)

        body += ' & '.join(cells)
        body += ' \\\\\n'
        if row['Model'] in ['Roberta', 'RandomForestMLArgs']:
            body += '\\midrule\n'

    # Table tail
    tail = dedent(rf"""
    \bottomrule
    \end{{tabular}}%
    }}
    \label{{tab:task-{task.lower()}-{eval_type}-regime}}
    \end{{table}}
    """)

    return hdr + body + tail
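
The per-regime variant can be exercised the same way. In this hedged sketch (not from the source) the column names follow the '{Regime}_{Metric}' convention parsed above; regimes with no matching column are simply omitted from the table, and the values are again placeholders.

import pandas as pd

# Import path assumed from 'src/run/multi_run/csv_to_latex.py'.
from src.run.multi_run.csv_to_latex import generate_latex_table_per_task_by_regime

wide = pd.DataFrame(
    {
        'Model': ['Roberta', 'RandomForestMLArgs'],
        'Unseen Reader_AUROC': ['0.70 (0.02)', '0.62 (0.03)'],
        'Average_AUROC': ['0.68 (0.02)', '0.61 (0.03)'],
    }
)

latex = generate_latex_table_per_task_by_regime(wide, task='CopCo_RCS', eval_type='test')
print(latex)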

is_metric_higher_better(metric_name)

Determine if higher values are better for a given metric.

Parameters:

- metric_name (str, required): The metric name (e.g., 'auroc', 'rmse', 'r2', 'R²')

Returns:

- bool: True if higher is better, False if lower is better

Source code in src/run/multi_run/csv_to_latex.py (lines 358-386)
def is_metric_higher_better(metric_name: str) -> bool:
    """
    Determine if higher values are better for a given metric.

    Args:
        metric_name: The metric name (e.g., 'auroc', 'rmse', 'r2', 'R²')

    Returns:
        True if higher is better, False if lower is better
    """
    # Normalize metric name to lowercase for comparison
    metric_lower = metric_name.lower().strip()

    # Metrics where lower is better
    lower_is_better = ['rmse', 'mae']

    # Metrics where higher is better
    higher_is_better = ['auroc', 'accuracy', 'balanced_accuracy', 'f1', 'r2', 'r²']

    if metric_lower in lower_is_better:
        return False
    elif metric_lower in higher_is_better:
        return True

    # Default: if it's a regression metric not in the lists, assume lower is better
    # Otherwise assume higher is better
    if metric_lower in [m.lower() for m in RegrSupportedMetrics]:
        return False
    return True
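
A few direct checks of the direction logic, which follow from the hard-coded lists above (import path assumed from the file location):

from src.run.multi_run.csv_to_latex import is_metric_higher_better

assert is_metric_higher_better('AUROC') is True   # 'auroc' is in the higher-is-better list
assert is_metric_higher_better('rmse') is False   # 'rmse' is in the lower-is-better list
assert is_metric_higher_better('R²') is True      # 'r²' is matched after lowercasing
# Names in neither list fall back to RegrSupportedMetrics membership,
# and default to "higher is better" otherwise.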

keep_only_all_eval(df)

Drop the per-regime evaluation columns from a metrics DataFrame, keeping only the aggregate ('All') results.

Source code in src/run/multi_run/csv_to_latex.py (lines 458-470)
def keep_only_all_eval(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop the per-regime evaluation columns, keeping only the aggregate ('All') results.
    """
    df = df.drop(
        columns=[
            'Seen subject unseen item',
            'Unseen subject seen item',
            'Unseen subject unseen item',
        ],
        errors='ignore',
    )
    return df
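
A toy example of the filtering; only the three dropped column names come from the source, the rest of the frame is made up:

import pandas as pd

# Import path assumed from 'src/run/multi_run/csv_to_latex.py'.
from src.run.multi_run.csv_to_latex import keep_only_all_eval

df = pd.DataFrame(
    {
        'Model': ['Roberta'],
        'All': [0.71],
        'Seen subject unseen item': [0.70],
        'Unseen subject seen item': [0.69],
        'Unseen subject unseen item': [0.68],
    }
)
print(keep_only_all_eval(df).columns.tolist())
# ['Model', 'All'] -- missing regime columns are ignored thanks to errors='ignore'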

prepare_dataframe_for_csv(df, model_col='Model')

Return a copy of df with human-friendly model names for CSV export.

Source code in src/run/multi_run/csv_to_latex.py (lines 413-422)
def prepare_dataframe_for_csv(
    df: pd.DataFrame, model_col: str = 'Model'
) -> pd.DataFrame:
    """Return a copy of ``df`` with human-friendly model names for CSV export."""

    csv_df = df.copy()
    if model_col in csv_df.columns:
        mapped = csv_df[model_col].map(MODEL_TO_COLUMN)
        csv_df[model_col] = mapped.fillna(csv_df[model_col])
    return csv_df
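
A small sketch of the renaming behaviour; 'SomeUnmappedModel' is a made-up value used only to show that unmapped names pass through unchanged:

import pandas as pd

# Import path assumed from 'src/run/multi_run/csv_to_latex.py'.
from src.run.multi_run.csv_to_latex import prepare_dataframe_for_csv

df = pd.DataFrame({'Model': ['Roberta', 'SomeUnmappedModel'], 'All': [0.71, 0.55]})
csv_df = prepare_dataframe_for_csv(df)
# 'Roberta' is replaced by its MODEL_TO_COLUMN display name (if mapped);
# 'SomeUnmappedModel' is kept as-is because of the fillna fallback.
csv_df.to_csv('results.csv', index=False)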

save_to_both_locations(content, relative_path, is_csv=False)

Save content to both the local results directory and the external output directory.

Parameters:

- content (str | DataFrame, required): The content to write (str for text files, DataFrame for CSV files)
- relative_path (str, required): The relative path within the results directory
- is_csv (bool, default False): Whether this is a CSV file (handled differently)

Source code in src/run/multi_run/csv_to_latex.py (lines 425-455)
def save_to_both_locations(
    content: str | pd.DataFrame, relative_path: str, is_csv: bool = False
):
    """
    Save content to both the local results directory and the external output directory.

    Args:
        content: The content to write (str for text files, DataFrame for CSV files)
        relative_path: The relative path within the results directory
        is_csv: Whether this is a CSV file (handled differently)
    """
    for base_dir in [LOCAL_OUTPUT_DIR, OVERLEAF_OUTPUT_DIR]:
        # If Overleaf dir does not exist, log and skip saving there
        if base_dir == OVERLEAF_OUTPUT_DIR and not base_dir.exists():
            logger.warning(
                f'Overleaf output dir does not exist ({OVERLEAF_OUTPUT_DIR}), skipping save to Overleaf'
            )
            continue

        path = base_dir / relative_path
        path.parent.mkdir(parents=True, exist_ok=True)
        try:
            if is_csv:
                # content expected to be a DataFrame
                content.to_csv(path, index=False)
            else:
                # content expected to be a string
                path.write_text(content)
            logger.info(f'Saved to: {path}')
        except Exception as e:
            logger.exception(f'Failed to save {relative_path} to {base_dir}: {e}')
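
Finally, a hedged usage sketch; the relative paths are illustrative, and the destination roots come from the module-level LOCAL_OUTPUT_DIR and OVERLEAF_OUTPUT_DIR settings (the Overleaf copy is skipped with a warning when that directory is missing):

import pandas as pd

# Import path assumed from 'src/run/multi_run/csv_to_latex.py'.
from src.run.multi_run.csv_to_latex import save_to_both_locations

latex_string = '\\begin{table} ... \\end{table}'  # e.g. the output of generate_latex_table_per_task
results_df = pd.DataFrame({'Model': ['Roberta'], 'AUROC': [0.71]})

# Strings are written with Path.write_text; DataFrames go through DataFrame.to_csv.
save_to_both_locations(latex_string, 'tables/example_task.tex')
save_to_both_locations(results_df, 'csv/example_task.csv', is_csv=True)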