In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

sns.set_theme()
In [2]:
# Directory layout: data/raw holds untouched inputs, data/processed holds derived files.
DATA = Path('..', 'data')
RAW = DATA / 'raw'
PROCESSED = DATA / 'processed'

Load data and utils¶

In [3]:
def pano_to_long(df):
    """Convert the panorama-score table from wide to long format.

    Each input row holds parallel lists in `scores` and `headings` (one entry
    per point of view of the panorama).  The lists are exploded into one row
    per POV, and `pov_index` records the position of the POV within its task.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain `task_id` plus the aligned list columns
        `scores` and `headings` (same length per row).

    Returns
    -------
    pd.DataFrame
        One row per (task, POV), with singular column names
        `score` / `heading` and a 0-based `pov_index` counter.
    """
    # Multi-column explode keeps scores and headings aligned row-by-row
    # (and raises loudly if the lists ever have mismatched lengths),
    # unlike the old per-column df.apply(pd.Series.explode) recipe.
    pano_long = df.explode(['scores', 'headings'])
    pano_long['pov_index'] = pano_long.groupby('task_id').cumcount()
    pano_long = pano_long.rename(columns={'headings': 'heading', 'scores': 'score'})
    return pano_long


def _read_processed(name):
    """Read one processed JSON file into a DataFrame."""
    return pd.read_json(PROCESSED / f'{name}.json')


pano_scores = _read_processed('panorama_scores')
pano_scores_long = pano_to_long(pano_scores)
img_scores = _read_processed('image_scores')
runs = _read_processed('runs')
users = _read_processed('users')

# Low-cardinality demographics are cheaper and clearer as categoricals.
demographic_cols = ['age_range', 'location', 'gender', 'level_of_education']
users[demographic_cols] = users[demographic_cols].astype('category')

# Sanity preview of every table: row count plus the first few rows.
for label, frame, n_rows in [
    ('pano_scores', pano_scores, 3),
    ('pano_scores_long', pano_scores_long, 2),
    ('img_scores', img_scores, 3),
    ('runs', runs, 3),
    ('users', users, 3),
]:
    print(f'{label}:', len(frame))
    display(frame.head(n_rows))
pano_scores: 650
panoid time_started time_finished scores headings familiarity run_id index_in_run task_id run_template_id user_id time
0 ToX0MGNWH-VsUEqq5wSCzQ 2023-02-02T23:00:51.256000Z 2023-02-02T23:02:15.021000Z [2.0, 4.0, 1.5, 1.5] [167, 257, 347, 77] 1 84 3 668 10 75 83
1 _zI03ND3kqbk0lyGr6va_A 2023-02-02T23:02:40.071000Z 2023-02-02T23:03:40.548000Z [4.3, 3.5, 3.3, 3.7] [313, 43, 133, 223] 1 84 6 667 10 75 60
2 vTjGNS9tAM-xTV0j85XRtA 2023-03-14T14:42:07.762000Z 2023-03-14T14:43:08.706000Z [3.5, 3.5, 3.5, 3.5] [352, 82, 172, 262] 1 361 3 2886 3 143 60
pano_scores_long: 2600
panoid time_started time_finished score heading familiarity run_id index_in_run task_id run_template_id user_id time pov_index
0 ToX0MGNWH-VsUEqq5wSCzQ 2023-02-02T23:00:51.256000Z 2023-02-02T23:02:15.021000Z 2.0 167 1 84 3 668 10 75 83 0
0 ToX0MGNWH-VsUEqq5wSCzQ 2023-02-02T23:00:51.256000Z 2023-02-02T23:02:15.021000Z 4.0 257 1 84 3 668 10 75 83 1
img_scores: 1940
panoid heading time_started time_finished score familiarity run_id index_in_run task_id run_template_id user_id time
0 yGy5iCyoO0PbL2JaR9EggQ 346 2023-02-02T23:00:23.057000Z 2023-02-02T23:00:41.429000Z 1.5 1 84 1 666 10 75 18
1 Wv_rNo6f8bMLVKkHdZotfg 142 2023-02-02T23:00:41.512000Z 2023-02-02T23:00:51.156000Z 4.5 1 84 2 671 10 75 9
2 1Lg37smPwqvAXl4bdDqhlw 162 2023-02-02T23:02:15.165000Z 2023-02-02T23:02:28.510000Z 4.5 1 84 4 670 10 75 13
runs: 329
id time_started time_finished run_template_id user_id img_task_ids pano_task_ids time
0 84 2023-02-02T22:59:11.602033Z 2023-02-02T23:03:44.914786Z 10 75 [666, 671, 670, 672, 665, 669] [668, 667] 273
1 361 2023-03-14T14:40:56.078633Z 2023-03-14T14:45:08.342496Z 3 143 [2883, 2890, 2884, 2887, 2885, 2888] [2886, 2889] 252
2 91 2023-02-03T09:44:16.438030Z 2023-02-03T09:49:42.938003Z 10 82 [721, 722, 728, 727, 725, 726] [723, 724] 326
users: 329
id run_ids age_range gender level_of_education location
0 39 [43] 25-29 male doctorate FR
1 20 [23] 25-29 female doctorate FR
2 70 [79] 15-19 female high school FR
In [4]:
# Panoids used in full panorama tasks; image-only panoids are the calibration items.
PANOIDS_TASKS = list(pano_scores['panoid'].unique())
PANOIDS_CALIBRATION = list(set(img_scores['panoid'].unique()) - set(PANOIDS_TASKS))
PANOIDS_ALL = PANOIDS_TASKS + PANOIDS_CALIBRATION
print(len(PANOIDS_TASKS))
print(PANOIDS_CALIBRATION)

RUN_TEMPLATE_IDS = sorted(img_scores['run_template_id'].unique())
print(len(RUN_TEMPLATE_IDS))

# Unique (panoid, heading) pairs of the non-calibration image tasks,
# sorted the same way a groupby index would be.
_task_pairs = (img_scores.loc[img_scores['panoid'].isin(PANOIDS_TASKS), ['panoid', 'heading']]
               .drop_duplicates()
               .sort_values(['panoid', 'heading']))
IMG_TASK_PARAMS = list(_task_pairs.itertuples(index=False, name=None))
print(len(IMG_TASK_PARAMS))
print(IMG_TASK_PARAMS)

CALIBRATION_TASK_PARAMS = [('yGy5iCyoO0PbL2JaR9EggQ', 346.0), ('TY6wOkPRysul_e_W73lmnQ', 224.0)]
24
['yGy5iCyoO0PbL2JaR9EggQ', 'TY6wOkPRysul_e_W73lmnQ']
12
48
[('1Lg37smPwqvAXl4bdDqhlw', 162), ('1Lg37smPwqvAXl4bdDqhlw', 342), ('GnReLERfph4NUc7ZCnYYIA', 30), ('GnReLERfph4NUc7ZCnYYIA', 210), ('J6b4uNaJXJePwh3i4g-J9g', 63), ('J6b4uNaJXJePwh3i4g-J9g', 243), ('OES3M70uJAKNvDX2lKkMPQ', 145), ('OES3M70uJAKNvDX2lKkMPQ', 325), ('P4F4QKFaTE5d-QNz_Jx0kg', 158), ('P4F4QKFaTE5d-QNz_Jx0kg', 248), ('PGxyIaP90yNpqgAOQylGog', 170), ('PGxyIaP90yNpqgAOQylGog', 350), ('SQRLJrD2KYe1d2VA6txyig', 11), ('SQRLJrD2KYe1d2VA6txyig', 191), ('ToX0MGNWH-VsUEqq5wSCzQ', 167), ('ToX0MGNWH-VsUEqq5wSCzQ', 347), ('UHt0RLnWk1TJnhkBTp6DeA', 103), ('UHt0RLnWk1TJnhkBTp6DeA', 283), ('Wv_rNo6f8bMLVKkHdZotfg', 142), ('Wv_rNo6f8bMLVKkHdZotfg', 322), ('_zI03ND3kqbk0lyGr6va_A', 43), ('_zI03ND3kqbk0lyGr6va_A', 223), ('aOT4Hl_n33HvBXyWpvYb4Q', 152), ('aOT4Hl_n33HvBXyWpvYb4Q', 332), ('dTb77iHE5hYvcqD26Y99TA', 98), ('dTb77iHE5hYvcqD26Y99TA', 278), ('iDUxUuJHoy4jOx-Yt8laNA', 58), ('iDUxUuJHoy4jOx-Yt8laNA', 238), ('iZ2ARYVKACAF8KFRIHr15w', 75), ('iZ2ARYVKACAF8KFRIHr15w', 165), ('jI40EDTDeCsmBibs1jbXzQ', 35), ('jI40EDTDeCsmBibs1jbXzQ', 215), ('m4kX2Djw5DmJbL40tel9Yw', 125), ('m4kX2Djw5DmJbL40tel9Yw', 305), ('pKtV8k7abhxUSE9JAkZLsA', 8), ('pKtV8k7abhxUSE9JAkZLsA', 188), ('qgpfKJzOZ5OBo5JdvCAp8Q', 152), ('qgpfKJzOZ5OBo5JdvCAp8Q', 332), ('tlPLzx1D7MRgcvFowbmWGw', 128), ('tlPLzx1D7MRgcvFowbmWGw', 308), ('v9YEYuKKwMPo3RZWKewZEQ', 0), ('v9YEYuKKwMPo3RZWKewZEQ', 90), ('vTjGNS9tAM-xTV0j85XRtA', 172), ('vTjGNS9tAM-xTV0j85XRtA', 352), ('x_gO8pWHTNMwQxHg9Xv5Sg', 86), ('x_gO8pWHTNMwQxHg9Xv5Sg', 266), ('ySFr8WwsE0Y1vkN1nZ19Rw', 164), ('ySFr8WwsE0Y1vkN1nZ19Rw', 344)]
In [5]:
def get_pano_img_scores(panoid, heading):
    """Return the (panorama scores, image scores) Series for one point of view."""
    pano_mask = (pano_scores_long['panoid'] == panoid) & (pano_scores_long['heading'] == heading)
    img_mask = (img_scores['panoid'] == panoid) & (img_scores['heading'] == heading)
    return pano_scores_long.loc[pano_mask, 'score'], img_scores.loc[img_mask, 'score']

Format data¶

Make a df with columns user_id, pov_id ({panoid}-{heading}), panoid, heading, task_type, score (plus a few bookkeeping columns added along the way)

  • task_type=pano: We already have these columns in pano_scores_long
  • task_type=image: We already have these columns in img_scores.

We just need to extract the relevant columns from each table and concatenate them, adding a task_type column to distinguish the two sources.

In [6]:
img_scores['run_template_id'].value_counts().sort_index()
Out[6]:
run_template_id
1     223
2     204
3     201
10    126
11    208
12    164
13    144
14    159
15    136
16    124
17    131
18    120
Name: count, dtype: int64
In [7]:
# Tag each table with its task type before concatenating.
pano_scores_long['task_type'] = 'pano'
img_scores['task_type'] = 'image'

# Only image tasks can be calibration items; panorama tasks never are.
pano_scores_long['is_calibration'] = False
img_scores['is_calibration'] = img_scores['panoid'].isin(PANOIDS_CALIBRATION)

# Normalise heading dtype so the pov_id strings below are consistent across both tables.
pano_scores_long['heading'] = pano_scores_long['heading'].astype(int)
img_scores['heading'] = img_scores['heading'].astype(int)

# add column pov_id ({panoid}-{heading}) to both df
pano_scores_long['pov_id'] = pano_scores_long['panoid'].astype(str) + '-' + pano_scores_long['heading'].astype(str)
img_scores['pov_id'] = img_scores['panoid'].astype(str) + '-' + img_scores['heading'].astype(str)

# add a boolean column has_counterpart if the pov_id has a counterpart in the other df
pano_scores_long['has_counterpart'] = pano_scores_long['pov_id'].isin(img_scores['pov_id'])
img_scores['has_counterpart'] = img_scores['pov_id'].isin(pano_scores_long['pov_id'])

# Three playlists (run templates) share each panorama group A-D.
playlist_to_pano_group_mapping = {
    1: 'A',
    2: 'A',
    3: 'A',
    10: 'B',
    11: 'B',
    12: 'B',
    13: 'C',
    14: 'C',
    15: 'C',
    16: 'D',
    17: 'D',
    18: 'D'
}

# extract relevant columns and concat
columns = ['user_id', 'pov_id', 'panoid', 'heading', 'task_type', 'has_counterpart', 'is_calibration', 'run_template_id', 'score']
final_df = pd.concat([pano_scores_long[columns], img_scores[columns]], ignore_index=True)
# Plain assignment instead of inplace=True keeps this cell idempotent on re-run
# (inplace rename mutates hidden state and has no performance benefit).
final_df = final_df.rename(columns={'run_template_id': 'playlist_id'})
final_df['panorama_group'] = final_df['playlist_id'].map(playlist_to_pano_group_mapping)
final_df.head()
Out[7]:
user_id pov_id panoid heading task_type has_counterpart is_calibration playlist_id score panorama_group
0 75 ToX0MGNWH-VsUEqq5wSCzQ-167 ToX0MGNWH-VsUEqq5wSCzQ 167 pano True False 10 2.0 B
1 75 ToX0MGNWH-VsUEqq5wSCzQ-257 ToX0MGNWH-VsUEqq5wSCzQ 257 pano False False 10 4.0 B
2 75 ToX0MGNWH-VsUEqq5wSCzQ-347 ToX0MGNWH-VsUEqq5wSCzQ 347 pano True False 10 1.5 B
3 75 ToX0MGNWH-VsUEqq5wSCzQ-77 ToX0MGNWH-VsUEqq5wSCzQ 77 pano False False 10 1.5 B
4 75 _zI03ND3kqbk0lyGr6va_A-313 _zI03ND3kqbk0lyGr6va_A 313 pano False False 10 4.3 B
In [8]:
final_df.to_csv(PROCESSED / 'R_dataframe.csv', index=False)

MOS¶

In [9]:
mos = pd.read_json(PROCESSED / 'mos.json')
# Strip the task-type prefixes so task_str matches the pov_id format used elsewhere.
# Vectorised .str.replace (regex=False => literal substring) instead of a row-wise apply.
mos['task_str'] = (mos['task_str']
                   .str.replace('img__', '', regex=False)
                   .str.replace('pano__', '', regex=False))
mos.head()
Out[9]:
zrec_mean zrec_ci95 ambiguity mean ci95 task_str task_type panoid heading scores unbiased_scores zrec_weights n_scores
0 3.639504 0.461978 1.192592 3.665000 0.536245 1Lg37smPwqvAXl4bdDqhlw__162 img 1Lg37smPwqvAXl4bdDqhlw 162 [4.5, 3.7, 4.3, 3.0, 1.8, 4.5, 1.8, 4.5, 4.0, ... [4.5269848954, 2.2755177064, 3.3382972997, 3.4... [1.6559975051, 2.6094936981, 3.1341151709, 2.3... 20
1 3.214081 0.223306 0.971571 3.144444 0.373453 1Lg37smPwqvAXl4bdDqhlw__342 img 1Lg37smPwqvAXl4bdDqhlw 342 [1.5, 3.0, 4.0, 4.7, 3.7, 2.5, 3.1, 4.0, 3.5, ... [1.8499639157, 3.0755039274, 3.550159749, 3.91... [0.9706302555, 0.7109065127, 4.0378420193, 2.8... 27
2 2.987567 0.294954 0.982113 3.200000 0.420049 GnReLERfph4NUc7ZCnYYIA__30 img GnReLERfph4NUc7ZCnYYIA 30 [2.2, 3.9, 5.0, 3.0, 4.0, 3.0, 1.4, 2.7, 3.0, ... [2.3192773176, 3.3940297921, 3.7309587511, 3.7... [1.413609054, 0.8850909961000001, 2.2580586099... 22
3 3.432103 0.232188 0.749119 3.474074 0.287947 GnReLERfph4NUc7ZCnYYIA__210 img GnReLERfph4NUc7ZCnYYIA 210 [3.3, 3.6, 4.0, 2.0, 3.5, 2.4, 3.5, 3.5, 3.2, ... [2.3220183982, 3.3413230723, 3.9997644147, 2.3... [1.9876567429, 1.3418638823, 2.1868066374, 1.1... 27
4 3.637987 0.215803 1.019651 3.496970 0.353284 J6b4uNaJXJePwh3i4g-J9g__63 img J6b4uNaJXJePwh3i4g-J9g 63 [3.0, 5.0, 4.0, 3.0, 4.3, 4.5, 3.5, 3.6, 2.0, ... [3.2990290282, 4.4552737761, 4.292272107, 3.90... [7.8503033188, 1.1137259141, 2.3090197319, 3.2... 33
In [10]:
def expand_mos_df(df):
    """Explode the per-rater array columns of the MOS table into one row per score.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the id columns (task_str, task_type, panoid, heading)
        and the aligned array columns scores / unbiased_scores / zrec_weights
        (same length per row — one entry per rater).

    Returns
    -------
    pd.DataFrame
        One row per (task, rater); `score_idx` is the 0-based position of the
        rater's score within its task.
    """
    id_cols = ['task_str', 'task_type', 'panoid', 'heading']
    array_cols = ['scores', 'unbiased_scores', 'zrec_weights']

    # Explode the three aligned array columns together.  Unlike the previous
    # per-column apply(...explode...) with an x.iloc[0] type probe, this also
    # works on an empty frame instead of raising IndexError.
    expanded_df = df[id_cols + array_cols].explode(array_cols, ignore_index=True)

    # Position of each score within its task (0-based rater index).
    expanded_df['score_idx'] = expanded_df.groupby(id_cols).cumcount()

    return expanded_df

# Expand MOS arrays to one row per individual score, with singular column names.
new_df = expand_mos_df(mos)

renames = {
    'scores': 'score',
    'unbiased_scores': 'unbiased_score',
    'zrec_weights': 'zrec_weight',
    'task_str': 'pov_id',
}
new_df = new_df.rename(columns=renames)

# Flag calibration items and POVs that exist in both task types.
new_df['is_calibration'] = new_df['panoid'].isin(PANOIDS_CALIBRATION)
IMG_TASK_PARAMS_str = [f'{panoid}__{heading}' for panoid, heading in IMG_TASK_PARAMS]
new_df['has_counterpart'] = new_df['pov_id'].isin(IMG_TASK_PARAMS_str)
display(new_df.head())

new_df.to_csv(PROCESSED / 'R_mos_dataframe.csv', index=False)
pov_id task_type panoid heading score unbiased_score zrec_weight score_idx is_calibration has_counterpart
0 1Lg37smPwqvAXl4bdDqhlw__162 img 1Lg37smPwqvAXl4bdDqhlw 162 4.5 4.526985 1.655998 0 False True
1 1Lg37smPwqvAXl4bdDqhlw__162 img 1Lg37smPwqvAXl4bdDqhlw 162 3.7 2.275518 2.609494 1 False True
2 1Lg37smPwqvAXl4bdDqhlw__162 img 1Lg37smPwqvAXl4bdDqhlw 162 4.3 3.338297 3.134115 2 False True
3 1Lg37smPwqvAXl4bdDqhlw__162 img 1Lg37smPwqvAXl4bdDqhlw 162 3.0 3.429386 2.368121 3 False True
4 1Lg37smPwqvAXl4bdDqhlw__162 img 1Lg37smPwqvAXl4bdDqhlw 162 1.8 2.581924 0.822017 4 False True