Setup¶
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.stats.weightstats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from pp2_preds_postprocessor import PP2PredsPostprocessor
sns.set_theme()
DATA = Path('..') / 'data'
RAW = DATA / 'raw'
PROCESSED = DATA / 'processed'
FIGURES_PAPER = Path('..') / 'figures'
FIGURES_PAPER.mkdir(exist_ok=True, parents=True)
EXPORT_FIGURES = True
def export_figure_paper(fig, title, dpi=400, bbox_inches='tight', **params):
if EXPORT_FIGURES:
fig.savefig(FIGURES_PAPER / title, dpi=dpi, bbox_inches=bbox_inches, **params)
Load data and utils¶
General¶
def pano_to_long(df):
pano_long = df.apply(pd.Series.explode)
pano_long['pov_index'] = pano_long.groupby('task_id').cumcount()
pano_long = pano_long.rename(columns={'headings': 'heading', 'scores': 'score'})
return pano_long
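For reference, `df.apply(pd.Series.explode)` explodes every list column in parallel and repeats the scalar columns. A toy sketch (made-up values) of what `pano_to_long` produces:
toy = pd.DataFrame({'task_id': [668], 'scores': [[2.0, 4.0]], 'headings': [[167, 257]]})
pano_to_long(toy)
# ->  task_id  score  heading  pov_index
# 0       668    2.0      167          0
# 0       668    4.0      257          1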
pano_scores = pd.read_json(PROCESSED / 'panorama_scores.json')
pano_scores_long = pano_to_long(pano_scores)
img_scores = pd.read_json(PROCESSED / 'image_scores.json')
runs = pd.read_json(PROCESSED / 'runs.json')
users = pd.read_json(PROCESSED / 'users.json')
users[['age_range', 'location', 'gender', 'level_of_education']] = users[['age_range', 'location', 'gender', 'level_of_education']].astype('category')
print('pano_scores:', len(pano_scores))
display(pano_scores.head(3))
print('pano_scores_long:', len(pano_scores_long))
display(pano_scores_long.head(2))
print('img_scores:', len(img_scores))
display(img_scores.head(3))
print('runs:', len(runs))
display(runs.head(3))
print('users:', len(users))
display(users.head(3))
pano_scores: 650
| | panoid | time_started | time_finished | scores | headings | familiarity | run_id | index_in_run | task_id | run_template_id | user_id | time |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ToX0MGNWH-VsUEqq5wSCzQ | 2023-02-02T23:00:51.256000Z | 2023-02-02T23:02:15.021000Z | [2.0, 4.0, 1.5, 1.5] | [167, 257, 347, 77] | 1 | 84 | 3 | 668 | 10 | 75 | 83 |
| 1 | _zI03ND3kqbk0lyGr6va_A | 2023-02-02T23:02:40.071000Z | 2023-02-02T23:03:40.548000Z | [4.3, 3.5, 3.3, 3.7] | [313, 43, 133, 223] | 1 | 84 | 6 | 667 | 10 | 75 | 60 |
| 2 | vTjGNS9tAM-xTV0j85XRtA | 2023-03-14T14:42:07.762000Z | 2023-03-14T14:43:08.706000Z | [3.5, 3.5, 3.5, 3.5] | [352, 82, 172, 262] | 1 | 361 | 3 | 2886 | 3 | 143 | 60 |
pano_scores_long: 2600
| | panoid | time_started | time_finished | score | heading | familiarity | run_id | index_in_run | task_id | run_template_id | user_id | time | pov_index |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ToX0MGNWH-VsUEqq5wSCzQ | 2023-02-02T23:00:51.256000Z | 2023-02-02T23:02:15.021000Z | 2.0 | 167 | 1 | 84 | 3 | 668 | 10 | 75 | 83 | 0 |
| 0 | ToX0MGNWH-VsUEqq5wSCzQ | 2023-02-02T23:00:51.256000Z | 2023-02-02T23:02:15.021000Z | 4.0 | 257 | 1 | 84 | 3 | 668 | 10 | 75 | 83 | 1 |
img_scores: 1940
| | panoid | heading | time_started | time_finished | score | familiarity | run_id | index_in_run | task_id | run_template_id | user_id | time |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | yGy5iCyoO0PbL2JaR9EggQ | 346 | 2023-02-02T23:00:23.057000Z | 2023-02-02T23:00:41.429000Z | 1.5 | 1 | 84 | 1 | 666 | 10 | 75 | 18 |
| 1 | Wv_rNo6f8bMLVKkHdZotfg | 142 | 2023-02-02T23:00:41.512000Z | 2023-02-02T23:00:51.156000Z | 4.5 | 1 | 84 | 2 | 671 | 10 | 75 | 9 |
| 2 | 1Lg37smPwqvAXl4bdDqhlw | 162 | 2023-02-02T23:02:15.165000Z | 2023-02-02T23:02:28.510000Z | 4.5 | 1 | 84 | 4 | 670 | 10 | 75 | 13 |
runs: 329
| | id | time_started | time_finished | run_template_id | user_id | img_task_ids | pano_task_ids | time |
|---|---|---|---|---|---|---|---|---|
| 0 | 84 | 2023-02-02T22:59:11.602033Z | 2023-02-02T23:03:44.914786Z | 10 | 75 | [666, 671, 670, 672, 665, 669] | [668, 667] | 273 |
| 1 | 361 | 2023-03-14T14:40:56.078633Z | 2023-03-14T14:45:08.342496Z | 3 | 143 | [2883, 2890, 2884, 2887, 2885, 2888] | [2886, 2889] | 252 |
| 2 | 91 | 2023-02-03T09:44:16.438030Z | 2023-02-03T09:49:42.938003Z | 10 | 82 | [721, 722, 728, 727, 725, 726] | [723, 724] | 326 |
users: 329
| | id | run_ids | age_range | gender | level_of_education | location |
|---|---|---|---|---|---|---|
| 0 | 39 | [43] | 25-29 | male | doctorate | FR |
| 1 | 20 | [23] | 25-29 | female | doctorate | FR |
| 2 | 70 | [79] | 15-19 | female | high school | FR |
PANOIDS_TASKS = list(pano_scores['panoid'].unique()) # all panoids except calibration
PANOIDS_CALIBRATION = list(set(img_scores['panoid'].unique()) - set(PANOIDS_TASKS))
PANOIDS_ALL = PANOIDS_TASKS + PANOIDS_CALIBRATION
print(len(PANOIDS_TASKS))
print(PANOIDS_CALIBRATION)
RUN_TEMPLATE_IDS = sorted(img_scores['run_template_id'].unique())
print(len(RUN_TEMPLATE_IDS))
IMG_TASK_PARAMS = list(img_scores[img_scores['panoid'].isin(PANOIDS_TASKS)].groupby(['panoid', 'heading']).size().index) # use groupby to get unique pairs of panoid, heading
print(len(IMG_TASK_PARAMS))
print(IMG_TASK_PARAMS)
24
['TY6wOkPRysul_e_W73lmnQ', 'yGy5iCyoO0PbL2JaR9EggQ']
12
48
[('1Lg37smPwqvAXl4bdDqhlw', 162), ('1Lg37smPwqvAXl4bdDqhlw', 342), ('GnReLERfph4NUc7ZCnYYIA', 30), ('GnReLERfph4NUc7ZCnYYIA', 210), ('J6b4uNaJXJePwh3i4g-J9g', 63), ('J6b4uNaJXJePwh3i4g-J9g', 243), ('OES3M70uJAKNvDX2lKkMPQ', 145), ('OES3M70uJAKNvDX2lKkMPQ', 325), ('P4F4QKFaTE5d-QNz_Jx0kg', 158), ('P4F4QKFaTE5d-QNz_Jx0kg', 248), ('PGxyIaP90yNpqgAOQylGog', 170), ('PGxyIaP90yNpqgAOQylGog', 350), ('SQRLJrD2KYe1d2VA6txyig', 11), ('SQRLJrD2KYe1d2VA6txyig', 191), ('ToX0MGNWH-VsUEqq5wSCzQ', 167), ('ToX0MGNWH-VsUEqq5wSCzQ', 347), ('UHt0RLnWk1TJnhkBTp6DeA', 103), ('UHt0RLnWk1TJnhkBTp6DeA', 283), ('Wv_rNo6f8bMLVKkHdZotfg', 142), ('Wv_rNo6f8bMLVKkHdZotfg', 322), ('_zI03ND3kqbk0lyGr6va_A', 43), ('_zI03ND3kqbk0lyGr6va_A', 223), ('aOT4Hl_n33HvBXyWpvYb4Q', 152), ('aOT4Hl_n33HvBXyWpvYb4Q', 332), ('dTb77iHE5hYvcqD26Y99TA', 98), ('dTb77iHE5hYvcqD26Y99TA', 278), ('iDUxUuJHoy4jOx-Yt8laNA', 58), ('iDUxUuJHoy4jOx-Yt8laNA', 238), ('iZ2ARYVKACAF8KFRIHr15w', 75), ('iZ2ARYVKACAF8KFRIHr15w', 165), ('jI40EDTDeCsmBibs1jbXzQ', 35), ('jI40EDTDeCsmBibs1jbXzQ', 215), ('m4kX2Djw5DmJbL40tel9Yw', 125), ('m4kX2Djw5DmJbL40tel9Yw', 305), ('pKtV8k7abhxUSE9JAkZLsA', 8), ('pKtV8k7abhxUSE9JAkZLsA', 188), ('qgpfKJzOZ5OBo5JdvCAp8Q', 152), ('qgpfKJzOZ5OBo5JdvCAp8Q', 332), ('tlPLzx1D7MRgcvFowbmWGw', 128), ('tlPLzx1D7MRgcvFowbmWGw', 308), ('v9YEYuKKwMPo3RZWKewZEQ', 0), ('v9YEYuKKwMPo3RZWKewZEQ', 90), ('vTjGNS9tAM-xTV0j85XRtA', 172), ('vTjGNS9tAM-xTV0j85XRtA', 352), ('x_gO8pWHTNMwQxHg9Xv5Sg', 86), ('x_gO8pWHTNMwQxHg9Xv5Sg', 266), ('ySFr8WwsE0Y1vkN1nZ19Rw', 164), ('ySFr8WwsE0Y1vkN1nZ19Rw', 344)]
MOS¶
mos = pd.read_json(PROCESSED / 'mos.json')
display(mos.head(3))
| | zrec_mean | zrec_ci95 | ambiguity | mean | ci95 | task_str | task_type | panoid | heading | scores | unbiased_scores | zrec_weights | n_scores |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.639504 | 0.461978 | 1.192592 | 3.665000 | 0.536245 | img__1Lg37smPwqvAXl4bdDqhlw__162 | img | 1Lg37smPwqvAXl4bdDqhlw | 162 | [4.5, 3.7, 4.3, 3.0, 1.8, 4.5, 1.8, 4.5, 4.0, ... | [4.5269848954, 2.2755177064, 3.3382972997, 3.4... | [1.6559975051, 2.6094936981, 3.1341151709, 2.3... | 20 |
| 1 | 3.214081 | 0.223306 | 0.971571 | 3.144444 | 0.373453 | img__1Lg37smPwqvAXl4bdDqhlw__342 | img | 1Lg37smPwqvAXl4bdDqhlw | 342 | [1.5, 3.0, 4.0, 4.7, 3.7, 2.5, 3.1, 4.0, 3.5, ... | [1.8499639157, 3.0755039274, 3.550159749, 3.91... | [0.9706302555, 0.7109065127, 4.0378420193, 2.8... | 27 |
| 2 | 2.987567 | 0.294954 | 0.982113 | 3.200000 | 0.420049 | img__GnReLERfph4NUc7ZCnYYIA__30 | img | GnReLERfph4NUc7ZCnYYIA | 30 | [2.2, 3.9, 5.0, 3.0, 4.0, 3.0, 1.4, 2.7, 3.0, ... | [2.3192773176, 3.3940297921, 3.7309587511, 3.7... | [1.413609054, 0.8850909961000001, 2.2580586099... | 22 |
PP2 scores¶
PP2_RESULTS = RAW / 'pp2_safer.json' # predictions for the same headings as in the experiment
def load_pp2_results(path):
df = pd.read_json(path)
# Create panoid, heading columns
df['panoid'] = df['img'].apply(lambda x: '_'.join(x.split('_')[:-1]))
    df['heading'] = df['img'].apply(lambda x: int(x.split('_')[-1].removesuffix('.jpg')))  # removesuffix, not rstrip: rstrip('.jpg') strips a *set* of characters
# Filter rows, keep only the PANOIDS that we used in this experiment
df = df[df['panoid'].isin(PANOIDS_ALL)]
# Process scores
df = df.rename(columns={'scores': 'score'})
processor = PP2PredsPostprocessor()
df['score'] = processor.process(df, ['score'], outliers='clip')
df['score'] = df['score'] * 4 + 1 # adapt to the [1, 5] range used in the experiment
return df
pp2 = load_pp2_results(PP2_RESULTS)
print('pp2: {} rows'.format(len(pp2)))
display(pp2.head())
print('NaN:', pp2['score'].isna().sum())
pp2: 98 rows
| | img | score | panoid | heading |
|---|---|---|---|---|
| 0 | GnReLERfph4NUc7ZCnYYIA_300.jpg | 2.367921 | GnReLERfph4NUc7ZCnYYIA | 300 |
| 1 | OES3M70uJAKNvDX2lKkMPQ_145.jpg | 3.554820 | OES3M70uJAKNvDX2lKkMPQ | 145 |
| 2 | x_gO8pWHTNMwQxHg9Xv5Sg_266.jpg | 3.178799 | x_gO8pWHTNMwQxHg9Xv5Sg | 266 |
| 3 | J6b4uNaJXJePwh3i4g-J9g_63.jpg | 3.585738 | J6b4uNaJXJePwh3i4g-J9g | 63 |
| 4 | vTjGNS9tAM-xTV0j85XRtA_352.jpg | 3.622242 | vTjGNS9tAM-xTV0j85XRtA | 352 |
NaN: 0
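The row count is consistent with the design: 24 task panoids × 4 headings each, plus the 2 calibration POVs. A quick sanity sketch:
assert len(pp2) == len(PANOIDS_TASKS) * 4 + len(PANOIDS_CALIBRATION)  # 24 * 4 + 2 == 98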
Plotting function¶
def plot_regression_and_print_coefficients(data, x_col, y_col, title=None, ax=None, xlim=None, ylim=None):
ax_provided = ax is not None
# Plotting regression
ax = sns.regplot(x=x_col, y=y_col, data=data, ax=ax)
if xlim is not None:
ax.set_xlim(*xlim)
if ylim is not None:
ax.set_ylim(*ylim)
if title is not None:
ax.set_title(title)
if not ax_provided:
plt.show()
# Calculate linear regression coefficients and R^2
X = data[[x_col]]
y = data[y_col]
model = LinearRegression().fit(X, y)
r2 = r2_score(y, model.predict(X))
print(f"Slope: {model.coef_[0]:.4f}")
print(f"Intercept: {model.intercept_:.4f}")
print(f"R^2: {r2:.4f}")
print("------------------------")
return ax
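Since this is a simple one-predictor regression, the reported R² equals the squared Pearson correlation between the two columns, which gives a quick independent cross-check (a sketch, not part of the original pipeline):
def r2_simple(x, y):
    # For one predictor, R^2 is the squared Pearson correlation coefficient
    return np.corrcoef(x, y)[0, 1] ** 2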
Compare scores between img/pano tasks¶
Statistical tests (Weighted t-test)¶
Perform the weighted t-test between each pair of distributions. For each POV (determined by a panoid and a heading), we retrieve the scores obtained under 1) the panorama modality (with context) and 2) the image modality (without context). We then run the statistical test on the two samples and compute the difference in recovered MOS. The test uses the unbiased scores and the weights returned by the ZREC method, since the recovered MOS is the weighted average of the unbiased scores.
The results are stored in the DataFrame `stats_img_vs_pano`.
weighted_t_test_p, weighted_t_test_significant, significance_level, img_pano_mos_diff, img_pano_mos_rel_diff = [], [], [], [], []
for panoid, heading in IMG_TASK_PARAMS:
data = mos[(mos['panoid']==panoid) & (mos['heading']==heading)]
img, pano = data[data['task_type']=='img'].iloc[0], data[data['task_type']=='pano'].iloc[0] # Series
# Different variance
tstat, pvalue, _ = statsmodels.stats.weightstats.ttest_ind(img['unbiased_scores'], pano['unbiased_scores'],
weights=(img['zrec_weights'], pano['zrec_weights']), usevar='unequal')
mos_diff = img['zrec_mean'] - pano['zrec_mean']
mos_rel_diff = np.abs(mos_diff) / img['zrec_mean'] # image as reference, so we can see the effect of 'adding context'
img_pano_mos_diff.append(mos_diff)
img_pano_mos_rel_diff.append(mos_rel_diff)
assert np.isclose(np.average(img['unbiased_scores'], weights=img['zrec_weights']), img['zrec_mean']), 'ZREC issue w/ img at task {} {}'.format(panoid, heading)
assert np.isclose(np.average(pano['unbiased_scores'], weights=pano['zrec_weights']), pano['zrec_mean']), 'ZREC issue w/ pano at task {} {}'.format(panoid, heading)
weighted_t_test_p.append(pvalue)
weighted_t_test_significant.append(pvalue < 0.05)
if pvalue < 0.001:
significance_level.append('***')
elif pvalue < 0.01:
significance_level.append('**')
elif pvalue < 0.05:
significance_level.append('*')
else:
significance_level.append('')
stats_img_vs_pano = pd.DataFrame(IMG_TASK_PARAMS, columns=['panoid', 'heading'])
stats_img_vs_pano['weighted_t_test_p'] = weighted_t_test_p
stats_img_vs_pano['weighted_t_test_significant'] = weighted_t_test_significant
stats_img_vs_pano['significance_level'] = significance_level
stats_img_vs_pano['img_pano_mos_diff'] = img_pano_mos_diff
stats_img_vs_pano['img_pano_mos_rel_diff'] = img_pano_mos_rel_diff
stats_img_vs_pano.head()
| | panoid | heading | weighted_t_test_p | weighted_t_test_significant | significance_level | img_pano_mos_diff | img_pano_mos_rel_diff |
|---|---|---|---|---|---|---|---|
| 0 | 1Lg37smPwqvAXl4bdDqhlw | 162 | 0.607590 | False | | -0.095127 | 0.026137 |
| 1 | 1Lg37smPwqvAXl4bdDqhlw | 342 | 0.941152 | False | | 0.007523 | 0.002341 |
| 2 | GnReLERfph4NUc7ZCnYYIA | 30 | 0.035863 | True | * | -0.298607 | 0.099950 |
| 3 | GnReLERfph4NUc7ZCnYYIA | 210 | 0.026778 | True | * | 0.315095 | 0.091808 |
| 4 | J6b4uNaJXJePwh3i4g-J9g | 63 | 0.060193 | False | | -0.147668 | 0.040591 |
Add corrected p-values (from R):
r_stats = pd.read_csv(PROCESSED / 'statistics.csv')
# Join stats_img_vs_pano with r_stats. Left: on "{panoid}__{heading}", right: on pov
stats_img_vs_pano['pov'] = stats_img_vs_pano['panoid'] + '__' + stats_img_vs_pano['heading'].astype(str)
merged_stats = stats_img_vs_pano.merge(r_stats[['pov', 'p_adjusted_holm']], on='pov', how='left')
# Rename columns
merged_stats = merged_stats.rename(columns={'p_adjusted_holm': 'weighted_t_test_p', 'weighted_t_test_p': 'weighted_t_test_p_not_corrected'})
# Recalculate the significance level ("weighted_t_test_significant" boolean and "significance_level")
merged_stats['weighted_t_test_significant'] = merged_stats['weighted_t_test_p'] < 0.05
merged_stats['significance_level'] = merged_stats['weighted_t_test_p'].apply(lambda p: '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else '')
merged_stats.head()
| | panoid | heading | weighted_t_test_p_not_corrected | weighted_t_test_significant | significance_level | img_pano_mos_diff | img_pano_mos_rel_diff | pov | weighted_t_test_p |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1Lg37smPwqvAXl4bdDqhlw | 162 | 0.607590 | False | | -0.095127 | 0.026137 | 1Lg37smPwqvAXl4bdDqhlw__162 | 1.000000 |
| 1 | 1Lg37smPwqvAXl4bdDqhlw | 342 | 0.941152 | False | | 0.007523 | 0.002341 | 1Lg37smPwqvAXl4bdDqhlw__342 | 1.000000 |
| 2 | GnReLERfph4NUc7ZCnYYIA | 30 | 0.035863 | False | | -0.298607 | 0.099950 | GnReLERfph4NUc7ZCnYYIA__30 | 0.824843 |
| 3 | GnReLERfph4NUc7ZCnYYIA | 210 | 0.026778 | False | | 0.315095 | 0.091808 | GnReLERfph4NUc7ZCnYYIA__210 | 0.696216 |
| 4 | J6b4uNaJXJePwh3i4g-J9g | 63 | 0.060193 | False | | -0.147668 | 0.040591 | J6b4uNaJXJePwh3i4g-J9g__63 | 1.000000 |
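As a sanity check, the Holm correction can also be reproduced directly in Python (a sketch assuming `statsmodels.stats.multitest`; it should agree with the values imported from R up to rounding):
from statsmodels.stats.multitest import multipletests
reject, p_holm, _, _ = multipletests(merged_stats['weighted_t_test_p_not_corrected'], alpha=0.05, method='holm')
print(np.allclose(p_holm, merged_stats['weighted_t_test_p']))  # expected True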
stats_img_vs_pano = merged_stats
print('Significant weighted t-test: {:.2f} %'.format(stats_img_vs_pano['weighted_t_test_significant'].sum() / len(stats_img_vs_pano) * 100))
Significant weighted t-test: 18.75 %
stats_img_vs_pano.loc[stats_img_vs_pano['weighted_t_test_significant'], 'significance_level'].value_counts().sort_index()
significance_level
*      4
**     4
***    1
Name: count, dtype: int64
Effect Size: Check difference in MOS on significant cases¶
stats_img_vs_pano.loc[stats_img_vs_pano['weighted_t_test_significant'], 'img_pano_mos_diff'].abs().describe()
count    9.000000
mean     0.492379
std      0.118611
min      0.250198
25%      0.425648
50%      0.524238
75%      0.578933
max      0.623508
Name: img_pano_mos_diff, dtype: float64
stats_img_vs_pano.loc[stats_img_vs_pano['weighted_t_test_significant'], 'img_pano_mos_rel_diff'].abs().describe()
count    9.000000
mean     0.170509
std      0.065659
min      0.072794
25%      0.134577
50%      0.171377
75%      0.210820
max      0.280479
Name: img_pano_mos_rel_diff, dtype: float64
Check if the average pano score has an influence on the sign of the diff img-pano¶
We previously compared each image score strictly to the pano score of the same POV. What if we instead compare the image score to the average pano score?
Better yet: we define a `context_score` as the average pano score of the 3 other POVs of the same panorama. We then check whether the image score is lower or higher than this context score. We expect the `context_score` to explain the sign of the difference between the img and pano scores: if img > pano, then the context should also be lower than img, since the context is what drives the shift in the pano score.
We store the results in a new DataFrame, `context`.
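For reference, the `context_score` could also be computed in a vectorized way (a sketch equivalent to the loop below, assuming exactly 4 pano POVs per panoid):
pano_only = mos[mos['task_type'] == 'pano']
g = pano_only.groupby('panoid')['zrec_mean']
# mean of the other POVs = (sum over all POVs - own score) / (count - 1)
context_alt = (g.transform('sum') - pano_only['zrec_mean']) / (g.transform('count') - 1)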
context = []
for panoid, heading in IMG_TASK_PARAMS:
significant = stats_img_vs_pano[(stats_img_vs_pano['panoid']==panoid) & (stats_img_vs_pano['heading']==heading)]['weighted_t_test_significant'].iloc[0]
img_mos = mos[(mos['panoid']==panoid) & (mos['heading']==heading) & (mos['task_type']=='img')]['zrec_mean'].iloc[0]
other_img_mos = mos[(mos['panoid']==panoid) & (mos['heading']!=heading) & (mos['task_type']=='img')]['zrec_mean'].iloc[0]
pano_mos = mos[(mos['panoid']==panoid) & (mos['heading']==heading) & (mos['task_type']=='pano')]['zrec_mean'].iloc[0]
context_filter = (mos['panoid']==panoid) & (mos['heading']!=heading) & (mos['task_type']=='pano') # Same place, different heading, pano task
context_score = mos.loc[context_filter, 'zrec_mean'].mean() # MOS of the other 3 POVs
pano_headings = (np.array([heading-90, heading+90, heading+180]) + 360) % 360
pano_left = mos[(mos['panoid']==panoid) & (mos['heading']==pano_headings[0]) & (mos['task_type']=='pano')]['zrec_mean'].iloc[0]
pano_right = mos[(mos['panoid']==panoid) & (mos['heading']==pano_headings[1]) & (mos['task_type']=='pano')]['zrec_mean'].iloc[0]
pano_opposed = mos[(mos['panoid']==panoid) & (mos['heading']==pano_headings[2]) & (mos['task_type']=='pano')]['zrec_mean'].iloc[0]
avgpano = mos[(mos['panoid']==panoid) & (mos['task_type']=='pano')]['zrec_mean'].mean()
# PP2
pp2_img = pp2[(pp2['panoid']==panoid) & (pp2['heading']==heading)]['score'].iloc[0]
pp2_left = pp2[(pp2['panoid']==panoid) & (pp2['heading']==pano_headings[0])]['score'].iloc[0]
pp2_right = pp2[(pp2['panoid']==panoid) & (pp2['heading']==pano_headings[1])]['score'].iloc[0]
pp2_opposed = pp2[(pp2['panoid']==panoid) & (pp2['heading']==pano_headings[2])]['score'].iloc[0]
context.append([panoid, heading, img_mos, pano_mos, other_img_mos, pano_left, pano_right, pano_opposed, pp2_img, pp2_left, pp2_right, pp2_opposed, context_score, significant, avgpano])
context = pd.DataFrame(context, columns=['panoid', 'heading', 'img_mos', 'pano_mos', 'other_img_mos', 'pano_left', 'pano_right', 'pano_opposed', 'pp2', 'pp2_left', 'pp2_right', 'pp2_opposed','context_score', 'significant', 'avgpano'])
context['img-context'] = context['img_mos'] - context['context_score']
context['img-pano'] = context['img_mos'] - context['pano_mos']
context['abs(img-pano)'] = (context['img_mos'] - context['pano_mos']).abs()
ctx = context[context['significant']]
context.head()
| | panoid | heading | img_mos | pano_mos | other_img_mos | pano_left | pano_right | pano_opposed | pp2 | pp2_left | pp2_right | pp2_opposed | context_score | significant | avgpano | img-context | img-pano | abs(img-pano) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1Lg37smPwqvAXl4bdDqhlw | 162 | 3.639504 | 3.734631 | 3.214081 | 3.539598 | 2.994824 | 3.206558 | 4.040594 | 3.671019 | 3.650846 | 3.930964 | 3.246993 | False | 3.368903 | 0.392511 | -0.095127 | 0.095127 |
| 1 | 1Lg37smPwqvAXl4bdDqhlw | 342 | 3.214081 | 3.206558 | 3.639504 | 2.994824 | 3.539598 | 3.734631 | 3.930964 | 3.650846 | 3.671019 | 4.040594 | 3.423018 | False | 3.368903 | -0.208936 | 0.007523 | 0.007523 |
| 2 | GnReLERfph4NUc7ZCnYYIA | 30 | 2.987567 | 3.286174 | 3.432103 | 2.713823 | 3.115227 | 3.117008 | 3.791631 | 2.367921 | 3.677613 | 3.177446 | 2.982019 | False | 3.058058 | 0.005548 | -0.298607 | 0.298607 |
| 3 | GnReLERfph4NUc7ZCnYYIA | 210 | 3.432103 | 3.117008 | 2.987567 | 3.115227 | 2.713823 | 3.286174 | 3.177446 | 3.677613 | 2.367921 | 3.791631 | 3.038408 | False | 3.058058 | 0.393695 | 0.315095 | 0.315095 |
| 4 | J6b4uNaJXJePwh3i4g-J9g | 63 | 3.637987 | 3.785655 | 2.980566 | 3.329778 | 3.555995 | 3.313977 | 3.585738 | 3.033215 | 3.062261 | 2.524135 | 3.399917 | False | 3.496351 | 0.238070 | -0.147668 | 0.147668 |
On the significant cases, is the sign of img − pano the same as the sign of img − context? If so, this supports the influence of the context.
We only consider the significant cases, since for the others the sign of the difference is not meaningful.
img_pano = ctx['img_mos'] - ctx['pano_mos']
img_ctx = ctx['img_mos'] - ctx['context_score']
# Are they of the same sign?
print('Influence of context: {:.2f} %'.format(np.average((img_pano * img_ctx) > 0) * 100))
Influence of context: 100.00 %
Task Ambiguity¶
def weighted_avg_std(values, weights):
"""
Function to calculate weighted average and weighted std
    :param values: opinion score array of shape [nb_obs, stimuli]
:param weights: weighting factor for the average and std.
It is expected to be the uncertainty of individual observers.
The expected shape is [nb_obs]
:return: average: weighted average as the recovered mean opinion scores
std: weighted std as the std of the recovered mean opinion scores
"""
average = np.ma.average(values, weights=weights, axis=0)
variance = np.ma.average((values - average) ** 2, weights=weights, axis=0)
std = np.sqrt(variance)
return average, std
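A minimal usage sketch on a single task row (given the consistency checks above, the weighted average should reproduce the recovered MOS):
row = mos.iloc[0]
avg, std = weighted_avg_std(np.asarray(row['unbiased_scores']), np.asarray(row['zrec_weights']))
print(np.isclose(avg, row['zrec_mean']))  # expected True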
mos['sos'] = mos['scores'].apply(lambda x: np.std(x)) # SOS = standard deviation of opinion scores
df = mos.set_index(['panoid', 'heading']).loc[IMG_TASK_PARAMS].copy() # exclude calibration and pano-only tasks
df['task_type'] = df['task_type'].astype('category').cat.rename_categories({'img': 'Image', 'pano': 'Panorama'})
pivot = df.pivot_table(index=['panoid', 'heading'], columns='task_type', values='sos', observed=False)
pivot['diff'] = pivot['Panorama'] - pivot['Image']
pivot['rel_diff'] = pivot['diff'] / pivot['Image'] * 100
plt.figure(figsize=(7,5))
sns.histplot(pivot['rel_diff'], bins=np.arange(-40, 80, 10))
plt.xlabel('Change in ambiguity introduced by the panorama (in %)')
export_figure_paper(plt.gcf(), 'ambiguity-variation.jpg', dpi=300)
plt.show()
Viz all scores¶
Scores (MOS + CI, MOS recovered + CI) vs panoid, grouped by task_type¶
def plot_one_pano(df, significance=None, shift=4, pano_color='orange', img_color='blue', alpha_no_zrec=0.3, ax=None):
    """Plot ZREC_MOS vs heading, grouped by task_type (shifted on each side of the heading tick by `shift`).
    Also plot the standard MOS with alpha `alpha_no_zrec`, shifted by `3*shift`.
    """
    if significance is None:  # avoid a mutable default argument
        significance = {}
    ax_provided = ax is not None
    if ax is None:
        fig, ax = plt.subplots(figsize=(5,4))
    else:
        fig = ax.figure  # ensure `fig` is defined when an axis is provided
plot_mean_ci_scores(df, ax, shift, pano_color, img_color, alpha=1, prefix='zrec_')
plot_mean_ci_scores(df, ax, 3*shift, pano_color, img_color, alpha=alpha_no_zrec, prefix='')
for heading, significance_level in significance.items():
add_significance(ax, df, heading, significance_level, shift)
xticks = df['heading'].unique().astype('int')
ax.set_xticks(xticks, xticks)
    ax.set_title(df['panoid'].iloc[0])  # take the panoid from the data rather than an outer-scope variable
ax.set_ylabel("Mean Score")
ax.set_xlabel("Heading")
ax.set_ylim(0.5, 5.5)
ax.legend()
if not ax_provided:
fig.tight_layout()
plt.show()
return fig, ax
def plot_mean_ci_scores(df, ax, shift, pano_color, img_color, alpha, prefix):
if alpha == 0:
return
xticks_pano, mean_pano, ci_pano = [], [], []
xticks_img, mean_img, ci_img = [], [], []
for h in df['heading'].sort_values().unique():
scores = df[df['heading']==h]
if len(scores) == 2: # both image and pano tasks
xticks_img.append(h-shift)
mean_img.append(scores[scores['task_type']=='img'][prefix+'mean'].iloc[0])
ci_img.append(scores[scores['task_type']=='img'][prefix+'ci95'].iloc[0])
xticks_pano.append(h+shift)
mean_pano.append(scores[scores['task_type']=='pano'][prefix+'mean'].iloc[0])
ci_pano.append(scores[scores['task_type']=='pano'][prefix+'ci95'].iloc[0])
else:
xticks_pano.append(h + shift)
mean_pano.append(scores[prefix+'mean'].iloc[0])
ci_pano.append(scores[prefix+'ci95'].iloc[0])
ax.scatter(xticks_pano, mean_pano, s=15, marker='_', c=pano_color, label=prefix+'pano', alpha=alpha)
ax.errorbar(xticks_pano, mean_pano, yerr=ci_pano, ls='', capsize=3, c=pano_color, alpha=alpha)
ax.scatter(xticks_img, mean_img, s=15, marker='_', c=img_color, label=prefix+'img', alpha=alpha)
ax.errorbar(xticks_img, mean_img, yerr=ci_img, ls='', capsize=3, c=img_color, alpha=alpha)
def add_significance(ax, df, heading, level, x_shift, bar_y_offset=0.15, star_y_offset=0.1, cap_length=0.1):
"""
Add a significance bar with caps and a star between the two task types for a given heading.
Parameters:
- ax: The axes on which to draw.
- df: The dataframe containing the data.
- heading: The heading at which to check for significance.
- bar_y_offset: The vertical offset for the significance bar from the top of the max error bar.
- star_y_offset: The vertical offset for the star from the top of the significance bar.
- cap_length: The length of the vertical caps at the ends of the significance bar.
"""
# Filter the dataframe for the specified heading
subset = df[df['heading'] == heading]
# If there are not exactly 2 points for this heading, return without drawing
if len(subset) != 2:
print(f"Warning: {len(subset)} points found for heading {heading}. Expected 2.")
return
# Calculate the y values for the top of the error bars for the two points
y1 = subset.iloc[0]['zrec_mean'] + subset.iloc[0]['zrec_ci95']
y2 = subset.iloc[1]['zrec_mean'] + subset.iloc[1]['zrec_ci95']
# Calculate the x values for the two points
x1 = heading - x_shift
x2 = heading + x_shift
# Determine the y value for the top of the significance bar
bar_top = max(y1, y2) + bar_y_offset
# Draw the significance bar
ax.plot([x1, x2], [bar_top, bar_top], color='black', lw=1)
# Draw the caps at the ends of the significance bar
ax.plot([x1, x1], [bar_top - cap_length, bar_top], color='black', lw=1)
ax.plot([x2, x2], [bar_top - cap_length, bar_top], color='black', lw=1)
# Draw the star above the center of the significance bar
ax.text((x1 + x2) / 2, bar_top + star_y_offset, level, ha='center', va='center')
def get_significance(panoid):
df = stats_img_vs_pano[(stats_img_vs_pano['panoid']==panoid) & (stats_img_vs_pano['weighted_t_test_significant'])]
return df[['heading', 'significance_level']].set_index('heading').to_dict()['significance_level']
n_cols = 3
n_rows = int(np.ceil(len(PANOIDS_TASKS) / n_cols))
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*4))
for panoid, ax in zip(PANOIDS_TASKS, axs.ravel()):
df = mos[mos['panoid']==panoid]
significance = get_significance(panoid)
plot_one_pano(df, significance=significance, pano_color='orange', img_color='blue', alpha_no_zrec=0.4, ax=ax)
fig.tight_layout()
export_figure_paper(fig, 'mos-zrec-vs-standard.jpg', dpi=400)
Mosaic with street view images (the SVI tiles themselves are omitted, as we are not able to release the images):
def create_split_array(n_rows_total, n_split):
# Calculate the common value for each element except the last one
common_value = n_rows_total // n_split
# Create the array with the common value
split_array = [common_value] * n_split
# Adjust the last element to ensure the sum equals n_rows_total
split_array[-1] += n_rows_total - sum(split_array)
return split_array
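For example, with the values used below (25 rows split over 3 figures):
assert create_split_array(25, 3) == [8, 8, 9]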
def add_svi(panoid, heading, ax):
    # Placeholder: the SVI itself cannot be released, so only the heading label is shown
    heading_str = str(int(heading))
ax.axis('off')
ax.set_title(heading_str + '°')
# Make a mosaic with a panoid in each plot
n_cols = 5
n_split = 3 # Split in multiple figures to include in the paper
panoids = PANOIDS_TASKS
data = mos[mos['panoid'].isin(panoids)].copy()
n_rows_total = len(panoids) + 1 # +1 for the reference images
figs = []
axes_list = []
n_rows_array = create_split_array(n_rows_total, n_split)
for n_rows in n_rows_array:
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*4, n_rows*4))
figs.append(fig)
axes_list.append(axes)
axes = np.concatenate(axes_list)
for panoid, axs in zip(panoids, axes[1:]):
df = data[data['panoid']==panoid]
significance = get_significance(panoid)
plot_one_pano(df, significance=significance, pano_color='orange', img_color='blue', alpha_no_zrec=0, ax=axs[0])
headings = df['heading'].unique()
headings = np.sort(headings)
for heading, ax in zip(headings, axs[1:]):
add_svi(panoid, heading, ax)
# Add calibration score
ax = axes[0, 0]
mos_calib = mos[mos['panoid'].isin(PANOIDS_CALIBRATION)].sort_values('zrec_mean') # ascending, so Ref 1 is the lower-scored reference
ref1_mos, ref1_ci = mos_calib['zrec_mean'].iloc[0], mos_calib['zrec_ci95'].iloc[0]
ref2_mos, ref2_ci = mos_calib['zrec_mean'].iloc[1], mos_calib['zrec_ci95'].iloc[1]
ax.scatter([-1], ref1_mos, s=15, marker='_', c='magenta')
ax.errorbar([-1], ref1_mos, yerr=ref1_ci, ls='', capsize=3, c='magenta')
ax.scatter([1], ref2_mos, s=15, marker='_', c='green')
ax.errorbar([1], ref2_mos, yerr=ref2_ci, ls='', capsize=3, c='green')
ax.set_xticks([-1,1], ['Ref 1', 'Ref 2'])
ax.set_xlim(-7, 7)
ax.set_ylim(0.5, 5.5)
ax.set_title('Reference Images')
ax.set_ylabel('Mean Score')
# Add calibration images
add_svi(mos_calib['panoid'].iloc[0], mos_calib['heading'].iloc[0], axes[0, 1])
axes[0, 1].set_title('Ref 1')
add_svi(mos_calib['panoid'].iloc[1], mos_calib['heading'].iloc[1], axes[0, 2])
axes[0, 2].set_title('Ref 2')
axes[0, 3].axis('off')
axes[0, 4].axis('off')
for i, fig in enumerate(figs):
fig.tight_layout()
export_figure_paper(fig, f'all-scores-comparison-img-pano-with-svi-{i}.jpg', dpi=300)
Smaller version for the body of the paper:
n_cols = 5
n_split = 1
panoids = ['ToX0MGNWH-VsUEqq5wSCzQ', 'aOT4Hl_n33HvBXyWpvYb4Q', 'Wv_rNo6f8bMLVKkHdZotfg', 'tlPLzx1D7MRgcvFowbmWGw']
data = mos[mos['panoid'].isin(panoids)].copy()
n_rows_total = len(panoids)
figs = []
axes_list = []
n_rows_array = create_split_array(n_rows_total, n_split)
for n_rows in n_rows_array:
fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*4, n_rows*4))
figs.append(fig)
axes_list.append(axes)
axes = np.concatenate(axes_list)
for panoid, axs in zip(panoids, axes):
df = data[data['panoid']==panoid]
significance = get_significance(panoid)
plot_one_pano(df, significance=significance, pano_color='orange', img_color='blue', alpha_no_zrec=0, ax=axs[0])
headings = df['heading'].unique()
headings = np.sort(headings)
for heading, ax in zip(headings, axs[1:]):
add_svi(panoid, heading, ax)
for i, fig in enumerate(figs):
fig.tight_layout()
export_figure_paper(fig, f'scores-comparison-img-pano-with-svi-{i}.jpg', dpi=300)
Comparison with PP2¶
POV-Level: Relationship between ZREC_MOS and the score predicted by RSSCNN¶
merged_df = pp2.merge(mos, on=['panoid', 'heading'], how='left')
img_data = merged_df[merged_df['task_type'] == 'img']
ax = plot_regression_and_print_coefficients(img_data, 'score', 'zrec_mean', 'Image Scores vs RSSCNN Score')
ax.set_ylabel('MOS')
ax.set_xlabel('RSSCNN Score')
ax.set_xlim(1.5, 5.2)
ax.set_ylim(1.5, 5.2)
export_figure_paper(ax.figure, 'up2-vs-pp2-images.jpg', dpi=300)
Slope: 0.5373
Intercept: 1.4055
R^2: 0.5370
------------------------
Place-Level: Relationship between the 4 ZREC_MOS and the 4 scores predicted by RSSCNN¶
def make_up2_pp2_df(up2_series, pp2_series):
df = up2_series.to_frame().join(pp2_series.to_frame(), on='panoid')
df.rename(columns={'score': 'pp2_score', 'zrec_mean': 'up2_score'}, inplace=True)
df['diff'] = df['up2_score'] - df['pp2_score']
return df
mos_pano = mos[mos['task_type']=='pano'].copy()
mos_pano_gpanoid = mos_pano.groupby('panoid')
pp2_gpanoid = pp2.groupby('panoid')
df = make_up2_pp2_df(mos_pano_gpanoid['zrec_mean'].agg('std'), pp2_gpanoid['score'].agg('std'))
fig, ax = plt.subplots(figsize=(7, 6))
sns.scatterplot(data=df, x='pp2_score', y='up2_score', ax=ax)
xmin, xmax = ax.get_xlim()
ymin, ymax = ax.get_ylim()
min_ = min(xmin, ymin)
max_ = max(xmax, ymax)
ax.plot([min_, max_], [min_, max_], ls='--', color='red')
ax.set_xlim(min_, max_)
ax.set_ylim(min_, max_)
export_figure_paper(ax.figure, 'up2-vs-pp2-panorama-std.jpg', dpi=300)
plt.show()
Completion time¶
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
sns.boxplot(x='index_in_run', y='time', data=img_scores, ax=ax[0], color=sns.color_palette()[0])
ax[0].set_title('Image tasks')
ax[0].set_ylabel('Completion time (seconds)')
ax[0].set_xlabel('Task index')
sns.boxplot(x='index_in_run', y='time', data=pano_scores, ax=ax[1], color=sns.color_palette()[0])
ax[1].set_title('Panorama tasks')
ax[1].set_ylabel('Completion time (seconds)')
ax[1].set_xlabel('Task index')
export_figure_paper(fig, 'completion-time-tasks.png')
plt.show()
Some numerical values:
runs['time'].describe()
count    329.000000
mean     304.130699
std      113.395775
min      137.000000
25%      224.000000
50%      283.000000
75%      354.000000
max      927.000000
Name: time, dtype: float64
img_scores['time'].describe()
count    1940.000000
mean       16.728351
std        10.592013
min         5.000000
25%        10.000000
50%        14.000000
75%        20.000000
max        83.000000
Name: time, dtype: float64
img_scores.groupby('index_in_run')['time'].mean()
index_in_run
0    25.802469
1    17.158055
2    16.478261
4    14.302469
5    12.720126
7    13.817337
Name: time, dtype: float64
pano_scores['time'].describe()
count    650.000000
mean      74.749231
std       27.842008
min       36.000000
25%       55.000000
50%       68.000000
75%       87.000000
max      222.000000
Name: time, dtype: float64
pano_scores.groupby('index_in_run')['time'].mean()
index_in_run
3    83.103976
6    66.291022
Name: time, dtype: float64
Descriptive data of the sample¶
Age¶
fig, ax = plt.subplots()
sns.countplot(users, x='age_range', ax=ax, color=sns.color_palette()[0])
ax.set_xlabel('Age Group')
ax.set_ylabel('Count')
# Rotate xticks
ax.set_xticks(ax.get_xticks()) # to avoid warnings
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode='anchor')
export_figure_paper(fig, 'age-distribution.png')
plt.show()
Gender¶
fig, ax = plt.subplots()
sns.countplot(users, x='gender', ax=ax, color=sns.color_palette()[0])
ax.set_xlabel('Gender')
ax.set_ylabel('Count')
# Rotate xticks
ax.set_xticks(ax.get_xticks()) # to avoid warnings
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode='anchor')
export_figure_paper(fig, 'gender-distribution.png')
plt.show()
Level of education¶
fig, ax = plt.subplots()
order = ['no degree', 'high school', 'bachelor', 'master', 'doctorate']
sns.countplot(users, x='level_of_education', ax=ax, order=order, color=sns.color_palette()[0])
ax.set_xlabel('Level of Education')
ax.set_ylabel('Count')
# Rotate xticks
ax.set_xticks(ax.get_xticks()) # to avoid warnings
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode='anchor')
export_figure_paper(fig, 'level-of-education-distribution.png')
plt.show()