In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme()
In [2]:
# Data directories, expressed relative to the notebook's working directory.
DATA = Path('..', 'data')
RAW = DATA.joinpath('raw')
PROCESSED = DATA.joinpath('processed')  # the JSON tables loaded below are read from here
Load data and utils¶
In [3]:
def pano_to_long(df):
    """Reshape panorama ratings from one row per task to one row per rated point of view.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``task_id`` column and the list-valued columns ``scores`` and
        ``headings``; within a row both lists must have the same length.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is left untouched) in which ``scores``/``headings`` are
        exploded and renamed to ``score``/``heading``, every other column is repeated,
        the original index labels are kept (so they repeat), and ``pov_index`` numbers
        the rows within each ``task_id`` starting at 0.

    Raises
    ------
    ValueError
        If ``scores`` and ``headings`` have different lengths in some row.
    """
    # Explode the two list columns together through the multi-column API instead of
    # `df.apply(pd.Series.explode)`, which relies on how `apply` re-aligns per-column
    # results of different lengths.
    pano_long = df.explode(['scores', 'headings'])
    pano_long['pov_index'] = pano_long.groupby('task_id').cumcount()
    return pano_long.rename(columns={'headings': 'heading', 'scores': 'score'})
# Read the processed tables; panorama ratings are additionally reshaped to long format.
pano_scores = pd.read_json(PROCESSED / 'panorama_scores.json')
pano_scores_long = pano_to_long(pano_scores)
img_scores = pd.read_json(PROCESSED / 'image_scores.json')
runs = pd.read_json(PROCESSED / 'runs.json')
users = pd.read_json(PROCESSED / 'users.json')

# Store the demographic columns as pandas categoricals.
demographic_cols = ['age_range', 'location', 'gender', 'level_of_education']
users[demographic_cols] = users[demographic_cols].astype('category')

# Print the row count and show a short preview of every table: (label, frame, preview rows).
for label, frame, n_preview in (
    ('pano_scores:', pano_scores, 3),
    ('pano_scores_long:', pano_scores_long, 2),
    ('img_scores:', img_scores, 3),
    ('runs:', runs, 3),
    ('users:', users, 3),
):
    print(label, len(frame))
    display(frame.head(n_preview))
pano_scores: 650
panoid | time_started | time_finished | scores | headings | familiarity | run_id | index_in_run | task_id | run_template_id | user_id | time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ToX0MGNWH-VsUEqq5wSCzQ | 2023-02-02T23:00:51.256000Z | 2023-02-02T23:02:15.021000Z | [2.0, 4.0, 1.5, 1.5] | [167, 257, 347, 77] | 1 | 84 | 3 | 668 | 10 | 75 | 83 |
1 | _zI03ND3kqbk0lyGr6va_A | 2023-02-02T23:02:40.071000Z | 2023-02-02T23:03:40.548000Z | [4.3, 3.5, 3.3, 3.7] | [313, 43, 133, 223] | 1 | 84 | 6 | 667 | 10 | 75 | 60 |
2 | vTjGNS9tAM-xTV0j85XRtA | 2023-03-14T14:42:07.762000Z | 2023-03-14T14:43:08.706000Z | [3.5, 3.5, 3.5, 3.5] | [352, 82, 172, 262] | 1 | 361 | 3 | 2886 | 3 | 143 | 60 |
pano_scores_long: 2600
panoid | time_started | time_finished | score | heading | familiarity | run_id | index_in_run | task_id | run_template_id | user_id | time | pov_index | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ToX0MGNWH-VsUEqq5wSCzQ | 2023-02-02T23:00:51.256000Z | 2023-02-02T23:02:15.021000Z | 2.0 | 167 | 1 | 84 | 3 | 668 | 10 | 75 | 83 | 0 |
0 | ToX0MGNWH-VsUEqq5wSCzQ | 2023-02-02T23:00:51.256000Z | 2023-02-02T23:02:15.021000Z | 4.0 | 257 | 1 | 84 | 3 | 668 | 10 | 75 | 83 | 1 |
img_scores: 1940
panoid | heading | time_started | time_finished | score | familiarity | run_id | index_in_run | task_id | run_template_id | user_id | time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | yGy5iCyoO0PbL2JaR9EggQ | 346 | 2023-02-02T23:00:23.057000Z | 2023-02-02T23:00:41.429000Z | 1.5 | 1 | 84 | 1 | 666 | 10 | 75 | 18 |
1 | Wv_rNo6f8bMLVKkHdZotfg | 142 | 2023-02-02T23:00:41.512000Z | 2023-02-02T23:00:51.156000Z | 4.5 | 1 | 84 | 2 | 671 | 10 | 75 | 9 |
2 | 1Lg37smPwqvAXl4bdDqhlw | 162 | 2023-02-02T23:02:15.165000Z | 2023-02-02T23:02:28.510000Z | 4.5 | 1 | 84 | 4 | 670 | 10 | 75 | 13 |
runs: 329
id | time_started | time_finished | run_template_id | user_id | img_task_ids | pano_task_ids | time | |
---|---|---|---|---|---|---|---|---|
0 | 84 | 2023-02-02T22:59:11.602033Z | 2023-02-02T23:03:44.914786Z | 10 | 75 | [666, 671, 670, 672, 665, 669] | [668, 667] | 273 |
1 | 361 | 2023-03-14T14:40:56.078633Z | 2023-03-14T14:45:08.342496Z | 3 | 143 | [2883, 2890, 2884, 2887, 2885, 2888] | [2886, 2889] | 252 |
2 | 91 | 2023-02-03T09:44:16.438030Z | 2023-02-03T09:49:42.938003Z | 10 | 82 | [721, 722, 728, 727, 725, 726] | [723, 724] | 326 |
users: 329
id | run_ids | age_range | gender | level_of_education | location | |
---|---|---|---|---|---|---|
0 | 39 | [43] | 25-29 | male | doctorate | FR |
1 | 20 | [23] | 25-29 | female | doctorate | FR |
2 | 70 | [79] | 15-19 | female | high school | FR |
In [4]:
# Panoids that were rated as panoramas; panoids that only occur in img_scores are
# treated as calibration panoids.
PANOIDS_TASKS = list(pano_scores['panoid'].unique())  # all panoids except calibration
# Sort the set difference: set iteration order over strings is not stable between
# kernel restarts, which would make PANOIDS_CALIBRATION / PANOIDS_ALL order vary per run.
PANOIDS_CALIBRATION = sorted(set(img_scores['panoid'].unique()) - set(PANOIDS_TASKS))
PANOIDS_ALL = PANOIDS_TASKS + PANOIDS_CALIBRATION
print(len(PANOIDS_TASKS))
print(PANOIDS_CALIBRATION)

RUN_TEMPLATE_IDS = sorted(img_scores['run_template_id'].unique())
print(len(RUN_TEMPLATE_IDS))

# Unique (panoid, heading) pairs of the non-calibration image tasks.
IMG_TASK_PARAMS = list(img_scores[img_scores['panoid'].isin(PANOIDS_TASKS)].groupby(['panoid', 'heading']).size().index)  # use groupby to get unique pairs of panoid, heading
print(len(IMG_TASK_PARAMS))
print(IMG_TASK_PARAMS)

# NOTE(review): headings are written as floats here while IMG_TASK_PARAMS prints ints;
# the tuples still compare equal, but string formatting would give '346.0' - confirm intended.
CALIBRATION_TASK_PARAMS = [('yGy5iCyoO0PbL2JaR9EggQ', 346.0), ('TY6wOkPRysul_e_W73lmnQ', 224.0)]
24 ['yGy5iCyoO0PbL2JaR9EggQ', 'TY6wOkPRysul_e_W73lmnQ'] 12 48 [('1Lg37smPwqvAXl4bdDqhlw', 162), ('1Lg37smPwqvAXl4bdDqhlw', 342), ('GnReLERfph4NUc7ZCnYYIA', 30), ('GnReLERfph4NUc7ZCnYYIA', 210), ('J6b4uNaJXJePwh3i4g-J9g', 63), ('J6b4uNaJXJePwh3i4g-J9g', 243), ('OES3M70uJAKNvDX2lKkMPQ', 145), ('OES3M70uJAKNvDX2lKkMPQ', 325), ('P4F4QKFaTE5d-QNz_Jx0kg', 158), ('P4F4QKFaTE5d-QNz_Jx0kg', 248), ('PGxyIaP90yNpqgAOQylGog', 170), ('PGxyIaP90yNpqgAOQylGog', 350), ('SQRLJrD2KYe1d2VA6txyig', 11), ('SQRLJrD2KYe1d2VA6txyig', 191), ('ToX0MGNWH-VsUEqq5wSCzQ', 167), ('ToX0MGNWH-VsUEqq5wSCzQ', 347), ('UHt0RLnWk1TJnhkBTp6DeA', 103), ('UHt0RLnWk1TJnhkBTp6DeA', 283), ('Wv_rNo6f8bMLVKkHdZotfg', 142), ('Wv_rNo6f8bMLVKkHdZotfg', 322), ('_zI03ND3kqbk0lyGr6va_A', 43), ('_zI03ND3kqbk0lyGr6va_A', 223), ('aOT4Hl_n33HvBXyWpvYb4Q', 152), ('aOT4Hl_n33HvBXyWpvYb4Q', 332), ('dTb77iHE5hYvcqD26Y99TA', 98), ('dTb77iHE5hYvcqD26Y99TA', 278), ('iDUxUuJHoy4jOx-Yt8laNA', 58), ('iDUxUuJHoy4jOx-Yt8laNA', 238), ('iZ2ARYVKACAF8KFRIHr15w', 75), ('iZ2ARYVKACAF8KFRIHr15w', 165), ('jI40EDTDeCsmBibs1jbXzQ', 35), ('jI40EDTDeCsmBibs1jbXzQ', 215), ('m4kX2Djw5DmJbL40tel9Yw', 125), ('m4kX2Djw5DmJbL40tel9Yw', 305), ('pKtV8k7abhxUSE9JAkZLsA', 8), ('pKtV8k7abhxUSE9JAkZLsA', 188), ('qgpfKJzOZ5OBo5JdvCAp8Q', 152), ('qgpfKJzOZ5OBo5JdvCAp8Q', 332), ('tlPLzx1D7MRgcvFowbmWGw', 128), ('tlPLzx1D7MRgcvFowbmWGw', 308), ('v9YEYuKKwMPo3RZWKewZEQ', 0), ('v9YEYuKKwMPo3RZWKewZEQ', 90), ('vTjGNS9tAM-xTV0j85XRtA', 172), ('vTjGNS9tAM-xTV0j85XRtA', 352), ('x_gO8pWHTNMwQxHg9Xv5Sg', 86), ('x_gO8pWHTNMwQxHg9Xv5Sg', 266), ('ySFr8WwsE0Y1vkN1nZ19Rw', 164), ('ySFr8WwsE0Y1vkN1nZ19Rw', 344)]
In [5]:
def get_pano_img_scores(panoid, heading):
    """Collect the scores given to one point of view in both task types.

    Looks up rows whose ``panoid`` and ``heading`` equal the arguments in the
    notebook-level frames ``pano_scores_long`` and ``img_scores``.

    Returns
    -------
    tuple of pandas.Series
        ``(pano_scores_filtered, img_scores_filtered)``: the ``score`` column of the
        matching rows, panorama ratings first, image ratings second.
    """
    def _scores_for_pov(scores_df):
        # Boolean mask selecting the rows of the requested (panoid, heading) pair.
        matches_pov = (scores_df['panoid'] == panoid) & (scores_df['heading'] == heading)
        return scores_df[matches_pov]['score']

    pano_scores_filtered = _scores_for_pov(pano_scores_long)
    img_scores_filtered = _scores_for_pov(img_scores)
    return pano_scores_filtered, img_scores_filtered
Format data¶
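Build a single long-format DataFrame with one row per rating and the columns user_id, pov_id ({panoid}-{heading}), panoid, heading, task_type, has_counterpart, is_calibration, playlist_id (renamed from run_template_id), score and panorama_group.
- task_type=pano: the source columns are already available in pano_scores_long.
- task_type=image: the source columns are already available in img_scores.
We only need to add the derived columns (task_type, is_calibration, pov_id, has_counterpart), select the relevant columns from both tables, concatenate them, and map each playlist to its panorama group.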
In [6]:
# Number of img_scores rows per run template, listed in ascending template id order.
rows_per_run_template = img_scores['run_template_id'].value_counts()
rows_per_run_template.sort_index()
Out[6]:
run_template_id 1 223 2 204 3 201 10 126 11 208 12 164 13 144 14 159 15 136 16 124 17 131 18 120 Name: count, dtype: int64
In [7]:
# Tag every rating with its task type; only image ratings can be calibration ratings.
pano_scores_long['task_type'] = 'pano'
img_scores['task_type'] = 'image'
pano_scores_long['is_calibration'] = False
img_scores['is_calibration'] = img_scores['panoid'].isin(PANOIDS_CALIBRATION)

# Same preparation for both tables: integer headings, then pov_id = '{panoid}-{heading}'.
for scores_df in (pano_scores_long, img_scores):
    scores_df['heading'] = scores_df['heading'].astype(int)
    scores_df['pov_id'] = scores_df['panoid'].astype(str) + '-' + scores_df['heading'].astype(str)

# has_counterpart: the same pov_id also occurs in the other table.
pano_scores_long['has_counterpart'] = pano_scores_long['pov_id'].isin(img_scores['pov_id'])
img_scores['has_counterpart'] = img_scores['pov_id'].isin(pano_scores_long['pov_id'])

# Each panorama group is covered by three playlists (run templates).
playlist_to_pano_group_mapping = {
    playlist_id: pano_group
    for pano_group, playlist_ids in (
        ('A', (1, 2, 3)),
        ('B', (10, 11, 12)),
        ('C', (13, 14, 15)),
        ('D', (16, 17, 18)),
    )
    for playlist_id in playlist_ids
}

# Stack the shared columns of both tables, panorama ratings first.
columns = ['user_id', 'pov_id', 'panoid', 'heading', 'task_type', 'has_counterpart', 'is_calibration', 'run_template_id', 'score']
final_df = (
    pd.concat([pano_scores_long[columns], img_scores[columns]], ignore_index=True)
    .rename(columns={'run_template_id': 'playlist_id'})
)
final_df['panorama_group'] = final_df['playlist_id'].map(playlist_to_pano_group_mapping)
final_df.head()
Out[7]:
user_id | pov_id | panoid | heading | task_type | has_counterpart | is_calibration | playlist_id | score | panorama_group | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 75 | ToX0MGNWH-VsUEqq5wSCzQ-167 | ToX0MGNWH-VsUEqq5wSCzQ | 167 | pano | True | False | 10 | 2.0 | B |
1 | 75 | ToX0MGNWH-VsUEqq5wSCzQ-257 | ToX0MGNWH-VsUEqq5wSCzQ | 257 | pano | False | False | 10 | 4.0 | B |
2 | 75 | ToX0MGNWH-VsUEqq5wSCzQ-347 | ToX0MGNWH-VsUEqq5wSCzQ | 347 | pano | True | False | 10 | 1.5 | B |
3 | 75 | ToX0MGNWH-VsUEqq5wSCzQ-77 | ToX0MGNWH-VsUEqq5wSCzQ | 77 | pano | False | False | 10 | 1.5 | B |
4 | 75 | _zI03ND3kqbk0lyGr6va_A-313 | _zI03ND3kqbk0lyGr6va_A | 313 | pano | False | False | 10 | 4.3 | B |
In [8]:
# Write the combined ratings table to the processed directory, without the index column.
r_dataframe_path = PROCESSED / 'R_dataframe.csv'
final_df.to_csv(r_dataframe_path, index=False)
MOS¶
In [9]:
# Load the MOS table and remove the task-type markers from task_str, which leaves the
# '{panoid}__{heading}' form seen in the preview.
mos = pd.read_json(PROCESSED / 'mos.json')
for type_marker in ('img__', 'pano__'):
    mos['task_str'] = mos['task_str'].map(lambda task: task.replace(type_marker, ''))
mos.head()
Out[9]:
zrec_mean | zrec_ci95 | ambiguity | mean | ci95 | task_str | task_type | panoid | heading | scores | unbiased_scores | zrec_weights | n_scores | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.639504 | 0.461978 | 1.192592 | 3.665000 | 0.536245 | 1Lg37smPwqvAXl4bdDqhlw__162 | img | 1Lg37smPwqvAXl4bdDqhlw | 162 | [4.5, 3.7, 4.3, 3.0, 1.8, 4.5, 1.8, 4.5, 4.0, ... | [4.5269848954, 2.2755177064, 3.3382972997, 3.4... | [1.6559975051, 2.6094936981, 3.1341151709, 2.3... | 20 |
1 | 3.214081 | 0.223306 | 0.971571 | 3.144444 | 0.373453 | 1Lg37smPwqvAXl4bdDqhlw__342 | img | 1Lg37smPwqvAXl4bdDqhlw | 342 | [1.5, 3.0, 4.0, 4.7, 3.7, 2.5, 3.1, 4.0, 3.5, ... | [1.8499639157, 3.0755039274, 3.550159749, 3.91... | [0.9706302555, 0.7109065127, 4.0378420193, 2.8... | 27 |
2 | 2.987567 | 0.294954 | 0.982113 | 3.200000 | 0.420049 | GnReLERfph4NUc7ZCnYYIA__30 | img | GnReLERfph4NUc7ZCnYYIA | 30 | [2.2, 3.9, 5.0, 3.0, 4.0, 3.0, 1.4, 2.7, 3.0, ... | [2.3192773176, 3.3940297921, 3.7309587511, 3.7... | [1.413609054, 0.8850909961000001, 2.2580586099... | 22 |
3 | 3.432103 | 0.232188 | 0.749119 | 3.474074 | 0.287947 | GnReLERfph4NUc7ZCnYYIA__210 | img | GnReLERfph4NUc7ZCnYYIA | 210 | [3.3, 3.6, 4.0, 2.0, 3.5, 2.4, 3.5, 3.5, 3.2, ... | [2.3220183982, 3.3413230723, 3.9997644147, 2.3... | [1.9876567429, 1.3418638823, 2.1868066374, 1.1... | 27 |
4 | 3.637987 | 0.215803 | 1.019651 | 3.496970 | 0.353284 | J6b4uNaJXJePwh3i4g-J9g__63 | img | J6b4uNaJXJePwh3i4g-J9g | 63 | [3.0, 5.0, 4.0, 3.0, 4.3, 4.5, 3.5, 3.6, 2.0, ... | [3.2990290282, 4.4552737761, 4.292272107, 3.90... | [7.8503033188, 1.1137259141, 2.3090197319, 3.2... | 33 |
In [10]:
def expand_mos_df(df):
    """Expand the per-task MOS table to one row per individual score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``task_str``, ``task_type``, ``panoid``, ``heading`` and the
        list-valued columns ``scores``, ``unbiased_scores`` and ``zrec_weights``; within
        a row the three lists must have the same length. Any other column is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is left untouched) with a fresh 0..n-1 index, the three list
        columns exploded in parallel, the identifying columns repeated, and ``score_idx``
        giving the position of each score within its task, starting at 0.

    Raises
    ------
    ValueError
        If the three list columns have different lengths in some row.
    """
    id_cols = ['task_str', 'task_type', 'panoid', 'heading']
    list_cols = ['scores', 'unbiased_scores', 'zrec_weights']

    # Explode the named list columns together. Deciding what to explode from the type of
    # the first row's value would leave a list column untouched whenever that first value
    # is missing.
    expanded_df = df[id_cols + list_cols].explode(list_cols)

    # Position of each score within its task (cumcount restarts at 0 for every group).
    expanded_df['score_idx'] = expanded_df.groupby(id_cols).cumcount()

    return expanded_df.reset_index(drop=True)
# Long-format MOS table: one row per individual score, with singular column names and
# task_str exposed as pov_id.
new_df = expand_mos_df(mos).rename(columns={
    'scores': 'score',
    'unbiased_scores': 'unbiased_score',
    'zrec_weights': 'zrec_weight',
    'task_str': 'pov_id',
})
new_df['is_calibration'] = new_df['panoid'].isin(PANOIDS_CALIBRATION)

# A row has a counterpart when its pov_id is one of the non-calibration image tasks,
# written in the same '{panoid}__{heading}' form as pov_id.
IMG_TASK_PARAMS_str = [f'{panoid}__{heading}' for panoid, heading in IMG_TASK_PARAMS]
new_df['has_counterpart'] = new_df['pov_id'].isin(IMG_TASK_PARAMS_str)

display(new_df.head())

# Write the long MOS table to the processed directory, without the index column.
r_mos_dataframe_path = PROCESSED / 'R_mos_dataframe.csv'
new_df.to_csv(r_mos_dataframe_path, index=False)
pov_id | task_type | panoid | heading | score | unbiased_score | zrec_weight | score_idx | is_calibration | has_counterpart | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1Lg37smPwqvAXl4bdDqhlw__162 | img | 1Lg37smPwqvAXl4bdDqhlw | 162 | 4.5 | 4.526985 | 1.655998 | 0 | False | True |
1 | 1Lg37smPwqvAXl4bdDqhlw__162 | img | 1Lg37smPwqvAXl4bdDqhlw | 162 | 3.7 | 2.275518 | 2.609494 | 1 | False | True |
2 | 1Lg37smPwqvAXl4bdDqhlw__162 | img | 1Lg37smPwqvAXl4bdDqhlw | 162 | 4.3 | 3.338297 | 3.134115 | 2 | False | True |
3 | 1Lg37smPwqvAXl4bdDqhlw__162 | img | 1Lg37smPwqvAXl4bdDqhlw | 162 | 3.0 | 3.429386 | 2.368121 | 3 | False | True |
4 | 1Lg37smPwqvAXl4bdDqhlw__162 | img | 1Lg37smPwqvAXl4bdDqhlw | 162 | 1.8 | 2.581924 | 0.822017 | 4 | False | True |