Setup¶

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import scipy
import seaborn as sns

sns.set_theme()
In [2]:
DATA = Path('..') / 'data'
PROCESSED = DATA / 'processed'

Load data and utils¶

In [3]:
def pano_to_long(df):
    pano_long = df.apply(pd.Series.explode)
    pano_long['pov_index'] = pano_long.groupby('task_id').cumcount()
    pano_long = pano_long.rename(columns={'headings': 'heading', 'scores': 'score'})
    return pano_long
In [4]:
pano_scores = pd.read_json(PROCESSED / 'panorama_scores.json')
pano_scores_long = pano_to_long(pano_scores)
img_scores = pd.read_json(PROCESSED / 'image_scores.json')
runs = pd.read_json(PROCESSED / 'runs.json')
users = pd.read_json(PROCESSED / 'users.json')

print('pano_scores:', len(pano_scores))
display(pano_scores.head(3))
print('pano_scores_long:', len(pano_scores_long))
display(pano_scores_long.head(2))
print('img_scores:', len(img_scores))
display(img_scores.head(3))
print('runs:', len(runs))
display(runs.head(3))
print('users:', len(users))
display(users.head(3))
pano_scores: 650
panoid time_started time_finished scores headings familiarity run_id index_in_run task_id run_template_id user_id time
0 ToX0MGNWH-VsUEqq5wSCzQ 2023-02-02T23:00:51.256000Z 2023-02-02T23:02:15.021000Z [2.0, 4.0, 1.5, 1.5] [167, 257, 347, 77] 1 84 3 668 10 75 83
1 _zI03ND3kqbk0lyGr6va_A 2023-02-02T23:02:40.071000Z 2023-02-02T23:03:40.548000Z [4.3, 3.5, 3.3, 3.7] [313, 43, 133, 223] 1 84 6 667 10 75 60
2 vTjGNS9tAM-xTV0j85XRtA 2023-03-14T14:42:07.762000Z 2023-03-14T14:43:08.706000Z [3.5, 3.5, 3.5, 3.5] [352, 82, 172, 262] 1 361 3 2886 3 143 60
pano_scores_long: 2600
panoid time_started time_finished score heading familiarity run_id index_in_run task_id run_template_id user_id time pov_index
0 ToX0MGNWH-VsUEqq5wSCzQ 2023-02-02T23:00:51.256000Z 2023-02-02T23:02:15.021000Z 2.0 167 1 84 3 668 10 75 83 0
0 ToX0MGNWH-VsUEqq5wSCzQ 2023-02-02T23:00:51.256000Z 2023-02-02T23:02:15.021000Z 4.0 257 1 84 3 668 10 75 83 1
img_scores: 1940
panoid heading time_started time_finished score familiarity run_id index_in_run task_id run_template_id user_id time
0 yGy5iCyoO0PbL2JaR9EggQ 346 2023-02-02T23:00:23.057000Z 2023-02-02T23:00:41.429000Z 1.5 1 84 1 666 10 75 18
1 Wv_rNo6f8bMLVKkHdZotfg 142 2023-02-02T23:00:41.512000Z 2023-02-02T23:00:51.156000Z 4.5 1 84 2 671 10 75 9
2 1Lg37smPwqvAXl4bdDqhlw 162 2023-02-02T23:02:15.165000Z 2023-02-02T23:02:28.510000Z 4.5 1 84 4 670 10 75 13
runs: 329
id time_started time_finished run_template_id user_id img_task_ids pano_task_ids time
0 84 2023-02-02T22:59:11.602033Z 2023-02-02T23:03:44.914786Z 10 75 [666, 671, 670, 672, 665, 669] [668, 667] 273
1 361 2023-03-14T14:40:56.078633Z 2023-03-14T14:45:08.342496Z 3 143 [2883, 2890, 2884, 2887, 2885, 2888] [2886, 2889] 252
2 91 2023-02-03T09:44:16.438030Z 2023-02-03T09:49:42.938003Z 10 82 [721, 722, 728, 727, 725, 726] [723, 724] 326
users: 329
id run_ids age_range gender level_of_education location
0 39 [43] 25-29 male doctorate FR
1 20 [23] 25-29 female doctorate FR
2 70 [79] 15-19 female high school FR

Define some constants used later:

In [5]:
PANOIDS_TASKS = list(pano_scores['panoid'].unique())  # all panoids except calibration
PANOIDS_CALIBRATION = list(set(img_scores['panoid'].unique()) - set(PANOIDS_TASKS))
PANOIDS_ALL = PANOIDS_TASKS + PANOIDS_CALIBRATION
print(len(PANOIDS_TASKS))
print(PANOIDS_CALIBRATION)

RUN_TEMPLATE_IDS = sorted(img_scores['run_template_id'].unique())
print(len(RUN_TEMPLATE_IDS))

IMG_TASK_PARAMS = list(img_scores[img_scores['panoid'].isin(PANOIDS_TASKS)].groupby(['panoid', 'heading']).size().index)  # use groupby to get unique pairs of panoid, heading
print(len(IMG_TASK_PARAMS))
print(IMG_TASK_PARAMS)

PANO_TASK_PARAMS = pano_scores_long[['panoid', 'heading']].drop_duplicates().sort_values('panoid').values.tolist()
print(len(PANO_TASK_PARAMS))
24
['yGy5iCyoO0PbL2JaR9EggQ', 'TY6wOkPRysul_e_W73lmnQ']
12
48
[('1Lg37smPwqvAXl4bdDqhlw', 162), ('1Lg37smPwqvAXl4bdDqhlw', 342), ('GnReLERfph4NUc7ZCnYYIA', 30), ('GnReLERfph4NUc7ZCnYYIA', 210), ('J6b4uNaJXJePwh3i4g-J9g', 63), ('J6b4uNaJXJePwh3i4g-J9g', 243), ('OES3M70uJAKNvDX2lKkMPQ', 145), ('OES3M70uJAKNvDX2lKkMPQ', 325), ('P4F4QKFaTE5d-QNz_Jx0kg', 158), ('P4F4QKFaTE5d-QNz_Jx0kg', 248), ('PGxyIaP90yNpqgAOQylGog', 170), ('PGxyIaP90yNpqgAOQylGog', 350), ('SQRLJrD2KYe1d2VA6txyig', 11), ('SQRLJrD2KYe1d2VA6txyig', 191), ('ToX0MGNWH-VsUEqq5wSCzQ', 167), ('ToX0MGNWH-VsUEqq5wSCzQ', 347), ('UHt0RLnWk1TJnhkBTp6DeA', 103), ('UHt0RLnWk1TJnhkBTp6DeA', 283), ('Wv_rNo6f8bMLVKkHdZotfg', 142), ('Wv_rNo6f8bMLVKkHdZotfg', 322), ('_zI03ND3kqbk0lyGr6va_A', 43), ('_zI03ND3kqbk0lyGr6va_A', 223), ('aOT4Hl_n33HvBXyWpvYb4Q', 152), ('aOT4Hl_n33HvBXyWpvYb4Q', 332), ('dTb77iHE5hYvcqD26Y99TA', 98), ('dTb77iHE5hYvcqD26Y99TA', 278), ('iDUxUuJHoy4jOx-Yt8laNA', 58), ('iDUxUuJHoy4jOx-Yt8laNA', 238), ('iZ2ARYVKACAF8KFRIHr15w', 75), ('iZ2ARYVKACAF8KFRIHr15w', 165), ('jI40EDTDeCsmBibs1jbXzQ', 35), ('jI40EDTDeCsmBibs1jbXzQ', 215), ('m4kX2Djw5DmJbL40tel9Yw', 125), ('m4kX2Djw5DmJbL40tel9Yw', 305), ('pKtV8k7abhxUSE9JAkZLsA', 8), ('pKtV8k7abhxUSE9JAkZLsA', 188), ('qgpfKJzOZ5OBo5JdvCAp8Q', 152), ('qgpfKJzOZ5OBo5JdvCAp8Q', 332), ('tlPLzx1D7MRgcvFowbmWGw', 128), ('tlPLzx1D7MRgcvFowbmWGw', 308), ('v9YEYuKKwMPo3RZWKewZEQ', 0), ('v9YEYuKKwMPo3RZWKewZEQ', 90), ('vTjGNS9tAM-xTV0j85XRtA', 172), ('vTjGNS9tAM-xTV0j85XRtA', 352), ('x_gO8pWHTNMwQxHg9Xv5Sg', 86), ('x_gO8pWHTNMwQxHg9Xv5Sg', 266), ('ySFr8WwsE0Y1vkN1nZ19Rw', 164), ('ySFr8WwsE0Y1vkN1nZ19Rw', 344)]
96
In [6]:
CALIBRATION_TASK_PARAMS = [('yGy5iCyoO0PbL2JaR9EggQ', 346.0), ('TY6wOkPRysul_e_W73lmnQ', 224.0)]
In [7]:
def get_pano_img_scores(panoid, heading):
    pano_scores = pano_scores_long[(pano_scores_long['panoid']==panoid) & (pano_scores_long['heading']==heading)]['score']
    img_scores = img_scores[(img_scores['panoid']==panoid) & (img_scores['heading']==heading)]['score']
    return pano_scores, img_scores

Transform Dataframes¶

Create a Dataframe for the MOS recovery process with ZREC¶

We need to create an array of shape (n_participants, n_stimuli). Since ZREC handles NaNs, we put all the scores in one array with 14 (nb of tasks per run) * 12 (number of different playlists) = 168 columns and n_participants rows, filling the rest with NaNs.

In [8]:
def get_task_params(run_template_id):
    img_task_params = img_scores[img_scores['run_template_id']==run_template_id][['panoid', 'heading']].drop_duplicates().values.tolist()
    pano_task_panoids = pano_scores[pano_scores['run_template_id']==run_template_id]['panoid'].unique()
    pano_task_params = {}
    for panoid in pano_task_panoids:
        pano_task_params[panoid] = eval(pano_scores[(pano_scores['panoid']==panoid) & (pano_scores['run_template_id']==run_template_id)]['headings'].iloc[0])
    return img_task_params, pano_task_params

def get_img_scores(img_task_params, run_id):
    scores = []
    for panoid, heading in img_task_params:
        score = img_scores[(img_scores['panoid']==panoid) & (img_scores['heading']==heading) & (img_scores['run_id']==run_id)]['score'].iloc[0]
        scores.append(score)
    return scores

def get_pano_scores(pano_task_params, run_id):
    scores = []
    p = pano_scores_long
    for panoid, headings in pano_task_params.items():
        for heading in headings:
            score = p[(p['panoid']==panoid) & (p['heading']==heading) & (p['run_id']==run_id)]['score'].iloc[0]
            scores.append(score)
    return scores

def get_column_names(img_task_params, pano_task_params, run_template_id):
    img_cols = ['{}__{}__img'.format(panoid, heading) if panoid not in PANOIDS_CALIBRATION else '{}__{}__{}__img'.format(panoid, heading, run_template_id) for panoid, heading in img_task_params]
    pano_cols = ['{}__{}__pano'.format(panoid, heading) for panoid, headings in pano_task_params.items() for heading in headings]
    return img_cols + pano_cols
In [9]:
# First, identify all 168 columns: task_type, panoid, heading
def make_task_params(task_type, task_params):
    return [(task_type, panoid, heading) for panoid, heading in task_params]

TASKS_COMBINED = make_task_params('img', IMG_TASK_PARAMS) + make_task_params('pano', PANO_TASK_PARAMS) + make_task_params('img', CALIBRATION_TASK_PARAMS)
zrec_data = np.full((len(runs), len(TASKS_COMBINED)), np.nan)

for row_idx, row in runs.iterrows():
    for col_idx, task in enumerate(TASKS_COMBINED):
        task_type, panoid, heading = task
        score = np.nan
        if task_type == 'img':
            data = img_scores[(img_scores['panoid']==panoid) & (img_scores['heading']==heading) & (img_scores['run_id']==row['id'])]['score']
            if len(data) > 0:
                score = data.iloc[0]
        elif task_type == 'pano':
            data = pano_scores_long[(pano_scores_long['panoid']==panoid) & (pano_scores_long['heading']==heading) & (pano_scores_long['run_id']==row['id'])]['score']
            if len(data) > 0:
                score = data.iloc[0]
        zrec_data[row_idx, col_idx] = score

TASKS_COMBINED_STR = ['{}__{}__{}'.format(task_type, panoid, heading) for task_type, panoid, heading in TASKS_COMBINED]
zrec_data = pd.DataFrame(zrec_data, columns=TASKS_COMBINED_STR)
zrec_data.head()
Out[9]:
img__1Lg37smPwqvAXl4bdDqhlw__162 img__1Lg37smPwqvAXl4bdDqhlw__342 img__GnReLERfph4NUc7ZCnYYIA__30 img__GnReLERfph4NUc7ZCnYYIA__210 img__J6b4uNaJXJePwh3i4g-J9g__63 img__J6b4uNaJXJePwh3i4g-J9g__243 img__OES3M70uJAKNvDX2lKkMPQ__145 img__OES3M70uJAKNvDX2lKkMPQ__325 img__P4F4QKFaTE5d-QNz_Jx0kg__158 img__P4F4QKFaTE5d-QNz_Jx0kg__248 ... pano__x_gO8pWHTNMwQxHg9Xv5Sg__356 pano__x_gO8pWHTNMwQxHg9Xv5Sg__86 pano__x_gO8pWHTNMwQxHg9Xv5Sg__266 pano__x_gO8pWHTNMwQxHg9Xv5Sg__176 pano__ySFr8WwsE0Y1vkN1nZ19Rw__344 pano__ySFr8WwsE0Y1vkN1nZ19Rw__74 pano__ySFr8WwsE0Y1vkN1nZ19Rw__164 pano__ySFr8WwsE0Y1vkN1nZ19Rw__254 img__yGy5iCyoO0PbL2JaR9EggQ__346.0 img__TY6wOkPRysul_e_W73lmnQ__224.0
0 4.5 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 1.5 4.5
1 NaN NaN NaN NaN 3.0 NaN NaN NaN 3.5 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 1.5 4.0
2 3.7 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 4.1 5.0
3 4.3 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 1.5 5.0
4 NaN NaN NaN NaN 5.0 NaN NaN NaN 4.0 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN 1.5 1.5

5 rows × 146 columns

Apply ZREC¶

In [10]:
from ZREC_Mos_Recovery import zrec_mos_recovery, plot_subject_inconsistency_bias
In [11]:
def mean_confidence_interval(data, confidence=0.95):
    """
    Return the mean associated CI for each column of the data, at the specified confidence level.
    Wrapper of `mean_ci`.
    """
    means, cis = [], []
    for j in range(data.shape[1]):
        d = data[:, j]
        m, err = mean_ci(d, confidence)
        means.append(m)
        cis.append(err)  # ci: [m-err, m+err]
    return np.array(means), np.array(cis)

def mean_ci(sample, confidence):
    """Return the mean and associated CI of the sample, at the specified confidence level."""
    n = len(sample)
    m, sem = np.nanmean(sample), scipy.stats.sem(sample, nan_policy='omit')  # se = np.std(a, ddof=1) / np.sqrt(n)
    if n < 30:
        critical_value = scipy.stats.t.ppf((1 + confidence) / 2, df=n-1)
    else:
        critical_value = scipy.stats.norm.ppf(0.975)
    err = sem * critical_value
    return m, err

Apply the ZREC method:

In [12]:
zrecmos, zrecmos_95ci, subject_inconsistency, subject_bias_factor, content_ambiguity, unbiased, weights = zrec_mos_recovery(zrec_data.to_numpy())

means, cis = mean_confidence_interval(zrec_data.to_numpy())

Create the mos dataframe that holds the Recovered MOS from the ZREC methods, as well as some other data useful for the statistical tests.

In [13]:
TARGET_COLUMNS = ['panoid',	'heading', 'task_type',	'zrec_mean', 'zrec_ci95', 'mean', 'ci95', 'unbiased_scores', 'zrec_weights', 'scores', 'ambiguity']

def get_task_mask(task_str):
    """Return a boolean mask to filter the scores that belong to this task, out of the scores of the whole experiment"""
    mask = zrec_data[task_str].isna()
    mask = ~mask
    return mask

def get_data(task_idx, task_str):
    """Return the zrec scores for this task"""
    mask = get_task_mask(task_str)
    scores = zrec_data[task_str][mask].to_numpy()
    scores_unbiased = unbiased[mask, task_idx]
    w = weights[mask]
    return scores, scores_unbiased, w

data = np.stack((zrecmos, zrecmos_95ci, content_ambiguity, means, cis), axis=1)
mos = pd.DataFrame(data, columns=['zrec_mean', 'zrec_ci95', 'ambiguity', 'mean', 'ci95'])
mos['task_str'] = TASKS_COMBINED_STR
mos[[ 'task_type', 'panoid', 'heading']] = mos['task_str'].str.split('__', expand=True)
mos[['scores', 'unbiased_scores', 'zrec_weights']] = mos.apply(lambda x: get_data(x.name, x['task_str']), axis=1, result_type='expand')
mos['n_scores'] = mos['unbiased_scores'].apply(len)
mos.head()
Out[13]:
zrec_mean zrec_ci95 ambiguity mean ci95 task_str task_type panoid heading scores unbiased_scores zrec_weights n_scores
0 3.639504 0.461978 1.192592 3.665000 0.536245 img__1Lg37smPwqvAXl4bdDqhlw__162 img 1Lg37smPwqvAXl4bdDqhlw 162 [4.5, 3.7, 4.3, 3.0, 1.8, 4.5, 1.8, 4.5, 4.0, ... [4.526984895425468, 2.275517706406146, 3.33829... [1.6559975051207856, 2.6094936981201253, 3.134... 20
1 3.214081 0.223306 0.971571 3.144444 0.373453 img__1Lg37smPwqvAXl4bdDqhlw__342 img 1Lg37smPwqvAXl4bdDqhlw 342 [1.5, 3.0, 4.0, 4.7, 3.7, 2.5, 3.1, 4.0, 3.5, ... [1.8499639156614003, 3.0755039274116753, 3.550... [0.970630255541091, 0.7109065126885892, 4.0378... 27
2 2.987567 0.294954 0.982113 3.200000 0.420049 img__GnReLERfph4NUc7ZCnYYIA__30 img GnReLERfph4NUc7ZCnYYIA 30 [2.2, 3.9, 5.0, 3.0, 4.0, 3.0, 1.4, 2.7, 3.0, ... [2.319277317640751, 3.394029792131038, 3.73095... [1.4136090540448125, 0.8850909961208744, 2.258... 22
3 3.432103 0.232188 0.749119 3.474074 0.287947 img__GnReLERfph4NUc7ZCnYYIA__210 img GnReLERfph4NUc7ZCnYYIA 210 [3.3, 3.6, 4.0, 2.0, 3.5, 2.4, 3.5, 3.5, 3.2, ... [2.322018398203028, 3.341323072300512, 3.99976... [1.9876567428685794, 1.3418638823051372, 2.186... 27
4 3.637987 0.215803 1.019651 3.496970 0.353284 img__J6b4uNaJXJePwh3i4g-J9g__63 img J6b4uNaJXJePwh3i4g-J9g 63 [3.0, 5.0, 4.0, 3.0, 4.3, 4.5, 3.5, 3.6, 2.0, ... [3.299029028200214, 4.45527377607297, 4.292272... [7.850303318750475, 1.1137259140920244, 2.3090... 33

Quick check that everything works when calculating the MOS and Recovered MOS by hand from the scores, unbiased_scores and zrec_weights:

In [14]:
for idx in (0, 1, 2, 98, 99):
    print('idx:', idx)
    row = mos.iloc[idx]

    print('ZREC MOS: {} -- {}'.format(row['zrec_mean'], np.average(row['unbiased_scores'], weights=row['zrec_weights'])))  # check the formula
    print('Regular MOS: {} -- {}'.format(row['mean'], np.average(row['scores'])))
idx: 0
ZREC MOS: 3.6395039525041413 -- 3.6395039525041395
Regular MOS: 3.6650000000000005 -- 3.665
idx: 1
ZREC MOS: 3.2140813841623856 -- 3.2140813841623856
Regular MOS: 3.144444444444445 -- 3.144444444444444
idx: 2
ZREC MOS: 2.9875670209338767 -- 2.9875670209338754
Regular MOS: 3.2 -- 3.2
idx: 98
ZREC MOS: 3.6818145591995877 -- 3.681814559199587
Regular MOS: 3.755555555555556 -- 3.755555555555556
idx: 99
ZREC MOS: 3.7420230303909485 -- 3.7420230303909485
Regular MOS: 3.7777777777777777 -- 3.7777777777777777
In [15]:
plot_subject_inconsistency_bias(subject_inconsistency, subject_bias_factor);
No description has been provided for this image
In [16]:
sns.histplot(subject_bias_factor, bins=20, kde=True);
No description has been provided for this image
In [17]:
sns.histplot(subject_inconsistency, bins=20, kde=True);
No description has been provided for this image

Export MOS¶

In [18]:
zrec_data.to_csv(PROCESSED / 'zrec_data.csv', index=False)
In [19]:
mos.to_json(PROCESSED / 'mos.json', orient='records')