Clean data¶

In [1]:
from pathlib import Path

import pandas as pd
import seaborn as sns

sns.set_theme()
In [2]:
# Directory layout: raw inputs are read from data/raw, cleaned outputs go to data/processed.
DATA = Path('..', 'data')
RAW = DATA.joinpath('raw')
PROCESSED = DATA.joinpath('processed')

# Create the output directory on first run; a no-op when it already exists.
PROCESSED.mkdir(exist_ok=True)

Load data¶

In [3]:
# Read the four raw JSON exports into DataFrames.
panos = pd.read_json(RAW.joinpath('panorama_scores.json'))
imgs = pd.read_json(RAW.joinpath('image_scores.json'))
runs = pd.read_json(RAW.joinpath('runs.json'))
users = pd.read_json(RAW.joinpath('users.json'))

# Show the row count and a three-row preview of each table.
for label, frame in (('panos', panos), ('images', imgs), ('runs', runs), ('users', users)):
    print(f'{label} ({len(frame)}):')
    display(frame.head(3))
panos (721):
panoid time_started time_finished scores headings familiarity run_id index_in_run task_id run_template_id user_id
0 ToX0MGNWH-VsUEqq5wSCzQ 2023-02-02T23:00:51.256000Z 2023-02-02T23:02:15.021000Z [2.0, 4.0, 1.5, 1.5] [167, 257, 347, 77] 1 84 3 668 10 75
1 _zI03ND3kqbk0lyGr6va_A 2023-02-02T23:02:40.071000Z 2023-02-02T23:03:40.548000Z [4.3, 3.5, 3.3, 3.7] [313, 43, 133, 223] 1 84 6 667 10 75
2 ToX0MGNWH-VsUEqq5wSCzQ 2023-01-31T15:20:40.194000Z 2023-01-31T15:22:26.241000Z [2.0, 1.5, 5.0, 4.0] [347, 77, 167, 257] 1 77 1 612 10 45
images (2130):
panoid heading time_started time_finished score familiarity run_id index_in_run task_id run_template_id user_id
0 yGy5iCyoO0PbL2JaR9EggQ 346 2023-02-02T23:00:23.057000Z 2023-02-02T23:00:41.429000Z 1.5 1 84 1 666 10 75
1 Wv_rNo6f8bMLVKkHdZotfg 142 2023-02-02T23:00:41.512000Z 2023-02-02T23:00:51.156000Z 4.5 1 84 2 671 10 75
2 1Lg37smPwqvAXl4bdDqhlw 162 2023-02-02T23:02:15.165000Z 2023-02-02T23:02:28.510000Z 4.5 1 84 4 670 10 75
runs (361):
id time_started time_finished run_template_id user_id img_task_ids pano_task_ids
0 84 2023-02-02T22:59:11.602033Z 2023-02-02T23:03:44.914786Z 10 75 [666, 671, 670, 672, 665, 669] [668, 667]
1 77 2023-01-31T15:20:17.043551Z 2023-01-31T15:24:06.603746Z 10 45 [613, 616, 614, 615] [612, 611]
2 361 2023-03-14T14:40:56.078633Z 2023-03-14T14:45:08.342496Z 3 143 [2883, 2890, 2884, 2887, 2885, 2888] [2886, 2889]
users (342):
id run_ids age_range gender level_of_education location
0 39 [43] 25-29 male doctorate FR
1 25 [29] 45-49 male doctorate FR
2 20 [23] 25-29 female doctorate FR
In [4]:
from datetime import datetime

def fmt_timedelta(td):
    """Return the duration ``td`` as a whole number of seconds.

    ``timedelta.seconds`` is only the seconds *component* (0..86399) and
    silently drops ``td.days``, so a duration of more than 24 hours would be
    reported as a small value and slip past the time cutoffs applied later.
    The day component is therefore folded in explicitly.

    Args:
        td: a ``datetime.timedelta``.

    Returns:
        int: total elapsed seconds, with the microsecond part discarded
        (floored). Negative durations yield a negative number.
    """
    # days and seconds are both integers, so the result stays an exact int.
    return td.days * 86400 + td.seconds

def parse_datetime(dt):
    """Parse a ``...Z``-suffixed timestamp string into a naive ``datetime``.

    Two layouts are accepted, tried in this order: with fractional seconds
    (``%Y-%m-%dT%H:%M:%S.%fZ``) and without (``%Y-%m-%dT%H:%M:%SZ``).

    Raises:
        ValueError: if ``dt`` matches neither layout.
    """
    accepted_layouts = ('%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%SZ')
    for layout in accepted_layouts:
        try:
            return datetime.strptime(dt, layout)
        except ValueError:
            # Not this layout; fall through to the next candidate.
            continue
    raise ValueError('Invalid datetime format: {}'.format(dt))

def calculate_time_spent(start, finish, fmt=True):
    """Compute the time elapsed between two timestamp strings.

    Args:
        start: start timestamp string, or None.
        finish: finish timestamp string, or None.
        fmt: when True, the elapsed time is passed through ``fmt_timedelta``;
            when False, the raw difference of the two parsed datetimes is
            returned.

    Returns:
        The elapsed time (formatted or raw, per ``fmt``). If either timestamp
        is None, the placeholder ``'--:--:--'`` when ``fmt`` is True, else None.
    """
    # Missing endpoint: nothing to measure, return the placeholder.
    if start is None or finish is None:
        return '--:--:--' if fmt else None

    elapsed = parse_datetime(finish) - parse_datetime(start)
    return fmt_timedelta(elapsed) if fmt else elapsed

def _row_time_spent(row):
    """Time spent on one row, from its 'time_started' and 'time_finished' fields."""
    return calculate_time_spent(row['time_started'], row['time_finished'])

# Add a 'time' column to each table, computed row by row.
for frame in (imgs, panos, runs):
    frame['time'] = frame.apply(_row_time_spent, axis=1)

Filter by time¶

Pano¶

In [5]:
# Accepted range (inclusive) for the time spent on a panorama task, in seconds.
low_cutoff_pano, high_cutoff_pano = 35, 240

n_panos_too_slow = int((panos['time'] > high_cutoff_pano).sum())
n_panos_too_fast = int((panos['time'] < low_cutoff_pano).sum())
print(f'Number of panos with time > {high_cutoff_pano}: {n_panos_too_slow}')
print(f'Number of panos with time < {low_cutoff_pano}: {n_panos_too_fast}')

# Distribution of the times that fall inside the accepted range.
pano_time_in_range = (panos['time'] >= low_cutoff_pano) & (panos['time'] <= high_cutoff_pano)
panos.loc[pano_time_in_range, 'time'].hist();
Number of panos with time > 240: 10
Number of panos with time < 35: 2
No description has been provided for this image

Apply filter (remove from both panos and runs):

In [6]:
print('n pano tasks before:', runs['pano_task_ids'].apply(len).sum())

# Rows whose time lies inside [low_cutoff_pano, high_cutoff_pano]; `between` is
# inclusive on both ends. Named explicitly so the builtin `filter` is not shadowed.
pano_time_ok = panos['time'].between(low_cutoff_pano, high_cutoff_pano)

# Collect the task ids to drop, grouped by the run they belong to.
dropped_panos = panos.loc[~pano_time_ok, ['run_id', 'task_id']]
dropped_pano_tasks_by_run = {}
for run_id, task_id in zip(dropped_panos['run_id'], dropped_panos['task_id']):
    dropped_pano_tasks_by_run.setdefault(run_id, set()).add(task_id)

# Rebuild each run's task list without the dropped ids. This replaces the
# previous approach of calling list.remove() on objects reached through a
# boolean-mask copy of `runs`: that only worked because the copy shares the
# list objects, re-scanned `runs` for every dropped row, and raised if a run
# or task id was missing.
runs['pano_task_ids'] = pd.Series(
    [
        [task_id for task_id in task_ids
         if task_id not in dropped_pano_tasks_by_run.get(run_id, ())]
        for run_id, task_ids in zip(runs['id'], runs['pano_task_ids'])
    ],
    index=runs.index,
)
print('n pano tasks after:', runs['pano_task_ids'].apply(len).sum())

panos = panos[pano_time_ok]
n pano tasks before: 721
n pano tasks after: 709

Img¶

In [7]:
# Accepted range (inclusive) for the time spent on an image task, in seconds.
low_cutoff_img, high_cutoff_img = 5, 90

n_imgs_too_slow = int((imgs['time'] > high_cutoff_img).sum())
n_imgs_too_fast = int((imgs['time'] < low_cutoff_img).sum())
print(f'Number of images with time > {high_cutoff_img}: {n_imgs_too_slow}')
print(f'Number of images with time < {low_cutoff_img}: {n_imgs_too_fast}')

# Distribution of the times that fall inside the accepted range.
img_time_in_range = (imgs['time'] >= low_cutoff_img) & (imgs['time'] <= high_cutoff_img)
imgs.loc[img_time_in_range, 'time'].hist();
Number of images with time > 90: 9
Number of images with time < 5: 28
No description has been provided for this image

Apply filter

In [8]:
print('n img tasks before:', runs['img_task_ids'].apply(len).sum())

# Rows whose time lies inside [low_cutoff_img, high_cutoff_img]; `between` is
# inclusive on both ends. Named explicitly so the builtin `filter` is not shadowed.
img_time_ok = imgs['time'].between(low_cutoff_img, high_cutoff_img)

# Collect the task ids to drop, grouped by the run they belong to.
dropped_imgs = imgs.loc[~img_time_ok, ['run_id', 'task_id']]
dropped_img_tasks_by_run = {}
for run_id, task_id in zip(dropped_imgs['run_id'], dropped_imgs['task_id']):
    dropped_img_tasks_by_run.setdefault(run_id, set()).add(task_id)

# Rebuild each run's task list without the dropped ids. This replaces the
# previous approach of calling list.remove() on objects reached through a
# boolean-mask copy of `runs`: that only worked because the copy shares the
# list objects, re-scanned `runs` for every dropped row, and raised if a run
# or task id was missing.
runs['img_task_ids'] = pd.Series(
    [
        [task_id for task_id in task_ids
         if task_id not in dropped_img_tasks_by_run.get(run_id, ())]
        for run_id, task_ids in zip(runs['id'], runs['img_task_ids'])
    ],
    index=runs.index,
)
print('n img tasks after:', runs['img_task_ids'].apply(len).sum())

imgs = imgs[img_time_ok]
n img tasks before: 2129
n img tasks after: 2092

Run¶

In [9]:
# Upper bound on the total duration of a run: 1200 seconds, i.e. 20 minutes.
cutoff_run = 1200

n_runs_too_long = int((runs['time'] > cutoff_run).sum())
print(f'Number of runs with time > {cutoff_run}: {n_runs_too_long}')

# Distribution of run durations at or below the cutoff.
runs.loc[runs['time'] <= cutoff_run, 'time'].hist();
Number of runs with time > 1200: 13
No description has been provided for this image

Apply filter

In [10]:
# Ids of the runs that took longer than the cutoff.
slow_run_ids = runs.loc[runs['time'] > cutoff_run, 'id']

# Delete the tasks recorded under those runs from imgs and panos. A single
# `isin` mask per table replaces re-filtering both tables once per slow run.
imgs = imgs[~imgs['run_id'].isin(slow_run_ids)]
panos = panos[~panos['run_id'].isin(slow_run_ids)]

# Delete from runs
runs = runs[runs['time'] <= cutoff_run]
In [11]:
# Row counts remaining after the time-based filters.
for label, frame in (('runs', runs), ('images', imgs), ('panos', panos)):
    print(f'Number of {label}: {len(frame)}')
Number of runs: 348
Number of images: 2016
Number of panos: 685

Remove multiple runs by the same user¶

We'll keep only each user's first run (sorted by start time).

In [12]:
for label, frame in (('n_runs', runs), ('n imgs tasks', imgs), ('n pano tasks', panos)):
    print(f'{label} before:', len(frame))

# Step 1: order the runs by user, then by start time within each user
runs_df = runs.sort_values(by=['user_id', 'time_started'])

# Step 2: flag every run that is not the first one listed for its user
duplicates = runs_df['user_id'].duplicated(keep='first')

# Step 3: collect the ids of the flagged runs
runs_to_remove = runs_df.loc[duplicates, 'id'].tolist()

print("Run IDs to remove:", runs_to_remove)

# Step 4: drop those runs, and the tasks recorded under them, from every table
runs = runs.loc[~runs['id'].isin(runs_to_remove)]
imgs = imgs.loc[~imgs['run_id'].isin(runs_to_remove)]
panos = panos.loc[~panos['run_id'].isin(runs_to_remove)]

print()
for label, frame in (('n_runs', runs), ('n imgs tasks', imgs), ('n pano tasks', panos)):
    print(f'{label} after:', len(frame))
n_runs before: 348
n imgs tasks before: 2016
n pano tasks before: 685
Run IDs to remove: [222, 52, 657, 77, 78, 151, 320, 322, 404, 153, 227, 230, 231, 366, 384, 232, 473, 474, 564]

n_runs after: 329
n imgs tasks after: 1940
n pano tasks after: 650

Filter users¶

Filter users to keep only those that are present in the run dataframe.

In [13]:
# Keep only the users that own at least one run in the cleaned `runs` table.
user_ids = runs['user_id'].unique()
user_has_run = users['id'].isin(user_ids)
users_filtered = users.loc[user_has_run]

# Expected to match len(runs), since duplicated runs were eliminated.
print(len(users_filtered))
329

Export¶

In [14]:
# Write each cleaned table to the processed directory as a JSON array of records.
processed_outputs = (
    ('panorama_scores.json', panos),
    ('image_scores.json', imgs),
    ('runs.json', runs),
    ('users.json', users_filtered),
)
for filename, frame in processed_outputs:
    frame.to_json(PROCESSED / filename, orient='records')