Clean data¶
In [1]:
from pathlib import Path
import pandas as pd
import seaborn as sns
sns.set_theme()
In [2]:
DATA = Path('..') / 'data'
RAW = DATA / 'raw'
PROCESSED = DATA / 'processed'
PROCESSED.mkdir(exist_ok=True)
Load data¶
In [3]:
panos = pd.read_json(RAW / 'panorama_scores.json')
imgs = pd.read_json(RAW / 'image_scores.json')
runs = pd.read_json(RAW / 'runs.json')
users = pd.read_json(RAW / 'users.json')
print(f'panos ({len(panos)}):')
display(panos.head(3))
print(f'images ({len(imgs)}):')
display(imgs.head(3))
print(f'runs ({len(runs)}):')
display(runs.head(3))
print(f'users ({len(users)}):')
display(users.head(3))
panos (721):
| | panoid | time_started | time_finished | scores | headings | familiarity | run_id | index_in_run | task_id | run_template_id | user_id |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ToX0MGNWH-VsUEqq5wSCzQ | 2023-02-02T23:00:51.256000Z | 2023-02-02T23:02:15.021000Z | [2.0, 4.0, 1.5, 1.5] | [167, 257, 347, 77] | 1 | 84 | 3 | 668 | 10 | 75 |
| 1 | _zI03ND3kqbk0lyGr6va_A | 2023-02-02T23:02:40.071000Z | 2023-02-02T23:03:40.548000Z | [4.3, 3.5, 3.3, 3.7] | [313, 43, 133, 223] | 1 | 84 | 6 | 667 | 10 | 75 |
| 2 | ToX0MGNWH-VsUEqq5wSCzQ | 2023-01-31T15:20:40.194000Z | 2023-01-31T15:22:26.241000Z | [2.0, 1.5, 5.0, 4.0] | [347, 77, 167, 257] | 1 | 77 | 1 | 612 | 10 | 45 |
images (2130):
| | panoid | heading | time_started | time_finished | score | familiarity | run_id | index_in_run | task_id | run_template_id | user_id |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | yGy5iCyoO0PbL2JaR9EggQ | 346 | 2023-02-02T23:00:23.057000Z | 2023-02-02T23:00:41.429000Z | 1.5 | 1 | 84 | 1 | 666 | 10 | 75 |
| 1 | Wv_rNo6f8bMLVKkHdZotfg | 142 | 2023-02-02T23:00:41.512000Z | 2023-02-02T23:00:51.156000Z | 4.5 | 1 | 84 | 2 | 671 | 10 | 75 |
| 2 | 1Lg37smPwqvAXl4bdDqhlw | 162 | 2023-02-02T23:02:15.165000Z | 2023-02-02T23:02:28.510000Z | 4.5 | 1 | 84 | 4 | 670 | 10 | 75 |
runs (361):
| | id | time_started | time_finished | run_template_id | user_id | img_task_ids | pano_task_ids |
|---|---|---|---|---|---|---|---|
| 0 | 84 | 2023-02-02T22:59:11.602033Z | 2023-02-02T23:03:44.914786Z | 10 | 75 | [666, 671, 670, 672, 665, 669] | [668, 667] |
| 1 | 77 | 2023-01-31T15:20:17.043551Z | 2023-01-31T15:24:06.603746Z | 10 | 45 | [613, 616, 614, 615] | [612, 611] |
| 2 | 361 | 2023-03-14T14:40:56.078633Z | 2023-03-14T14:45:08.342496Z | 3 | 143 | [2883, 2890, 2884, 2887, 2885, 2888] | [2886, 2889] |
users (342):
| | id | run_ids | age_range | gender | level_of_education | location |
|---|---|---|---|---|---|---|
| 0 | 39 | [43] | 25-29 | male | doctorate | FR |
| 1 | 25 | [29] | 45-49 | male | doctorate | FR |
| 2 | 20 | [23] | 25-29 | female | doctorate | FR |
In [4]:
from datetime import datetime
def fmt_timedelta(td):
    # Return the elapsed time as whole seconds (the fractional part is dropped).
    return td.seconds

def parse_datetime(dt):
    try:
        return datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S.%fZ')
    except ValueError:
        try:
            return datetime.strptime(dt, '%Y-%m-%dT%H:%M:%SZ')
        except ValueError:
            raise ValueError('Invalid datetime format: {}'.format(dt))

def calculate_time_spent(start, finish, fmt=True):
    # Elapsed time between two ISO timestamps; falls back to a placeholder
    # when either timestamp is missing.
    if start is not None and finish is not None:
        time = parse_datetime(finish) - parse_datetime(start)
        if fmt:
            time = fmt_timedelta(time)
    else:
        time = '--:--:--' if fmt else None
    return time
imgs['time'] = imgs.apply(lambda x: calculate_time_spent(x['time_started'], x['time_finished']), axis=1)
panos['time'] = panos.apply(lambda x: calculate_time_spent(x['time_started'], x['time_finished']), axis=1)
runs['time'] = runs.apply(lambda x: calculate_time_spent(x['time_started'], x['time_finished']), axis=1)
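As a quick check of the helpers (a sketch, not one of the original cells), applying them to the two timestamps of pano row 0 shown above gives the elapsed time as whole seconds; note that the fractional part is dropped, and that td.seconds would wrap for gaps longer than a day, which does not occur at these time scales:

# ~83.8 s between the two timestamps of pano row 0 above -> returned as 83.
calculate_time_spent('2023-02-02T23:00:51.256000Z', '2023-02-02T23:02:15.021000Z')
# Missing timestamps fall back to the '--:--:--' placeholder.
calculate_time_spent(None, None)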
Filter by time¶
Pano¶
In [5]:
high_cutoff_pano = 240 # seconds
low_cutoff_pano = 35
print(f'Number of panos with time > {high_cutoff_pano}: {len(panos[panos["time"] > high_cutoff_pano])}')
print(f'Number of panos with time < {low_cutoff_pano}: {len(panos[panos["time"] < low_cutoff_pano])}')
panos[(low_cutoff_pano <= panos['time']) & (panos['time']<= high_cutoff_pano)]['time'].hist();
Number of panos with time > 240: 10
Number of panos with time < 35: 2
Apply the filter (remove the rejected pano tasks from both `panos` and `runs`):
In [6]:
print('n pano tasks before:', runs['pano_task_ids'].apply(len).sum())
keep = (low_cutoff_pano <= panos['time']) & (panos['time'] <= high_cutoff_pano)
for _, row in panos[~keep][['run_id', 'task_id']].iterrows():
    run_id, task_id = row['run_id'], row['task_id']
    # Chained indexing returns the underlying list object, so .remove()
    # mutates that run's pano_task_ids in place.
    runs[runs['id'] == run_id]['pano_task_ids'].iloc[0].remove(task_id)
print('n pano tasks after:', runs['pano_task_ids'].apply(len).sum())
panos = panos[keep]
n pano tasks before: 721
n pano tasks after: 709
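The loop above relies on chained indexing returning the run's actual task-ID list, which `.remove()` then mutates in place. A more explicit equivalent (a sketch, using a hypothetical `panos_rejected` holding the rows rejected by the time filter, captured before `panos` is reassigned) would locate each run row by label and assign a new list:

# Hypothetical explicit variant of the in-place removal above.
for _, row in panos_rejected[['run_id', 'task_id']].iterrows():
    idx = runs.index[runs['id'] == row['run_id']][0]
    runs.at[idx, 'pano_task_ids'] = [
        t for t in runs.at[idx, 'pano_task_ids'] if t != row['task_id']
    ]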
Img¶
In [7]:
high_cutoff_img = 90 # seconds
low_cutoff_img = 5
print(f'Number of images with time > {high_cutoff_img}: {len(imgs[imgs["time"] > high_cutoff_img])}')
print(f'Number of images with time < {low_cutoff_img}: {len(imgs[imgs["time"] < low_cutoff_img])}')
imgs[(imgs['time'] >= low_cutoff_img) & (imgs['time'] <= high_cutoff_img)]['time'].hist();
Number of images with time > 90: 9
Number of images with time < 5: 28
Apply the filter (same procedure as for the panoramas):
In [8]:
print('n img tasks before:', runs['img_task_ids'].apply(len).sum())
keep = (low_cutoff_img <= imgs['time']) & (imgs['time'] <= high_cutoff_img)
for _, row in imgs[~keep][['run_id', 'task_id']].iterrows():
    run_id, task_id = row['run_id'], row['task_id']
    runs[runs['id'] == run_id]['img_task_ids'].iloc[0].remove(task_id)
print('n img tasks after:', runs['img_task_ids'].apply(len).sum())
imgs = imgs[keep]
n img tasks before: 2129
n img tasks after: 2092
Run¶
In [9]:
cutoff_run = 1200 # seconds, i.e. 20 minutes
print(f'Number of runs with time > {cutoff_run}: {len(runs[runs["time"] > cutoff_run])}')
runs[runs['time'] <= cutoff_run]['time'].hist();
Number of runs with time > 1200: 13
Apply the filter:
In [10]:
# Delete the affected tasks from imgs and panos
for _, run_row in runs[runs['time'] > cutoff_run].iterrows():
    run_id = run_row['id']
    imgs = imgs[imgs['run_id'] != run_id]
    panos = panos[panos['run_id'] != run_id]
# Delete the over-time runs themselves
runs = runs[runs['time'] <= cutoff_run]
In [11]:
print(f'Number of runs: {len(runs)}')
print(f'Number of images: {len(imgs)}')
print(f'Number of panos: {len(panos)}')
Number of runs: 348
Number of images: 2016
Number of panos: 685
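A quick consistency check (a sketch, not one of the original cells): after the time filters, every remaining image and panorama task should reference a run that survived.

# All remaining task rows should point at surviving runs.
run_ids = set(runs['id'])
assert set(imgs['run_id']).issubset(run_ids)
assert set(panos['run_id']).issubset(run_ids)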
Remove multiple runs by the same user¶
We keep only each user's first run (sorted by start time).
In [12]:
print('n_runs before:', len(runs))
print('n imgs tasks before:', len(imgs))
print('n pano tasks before:', len(panos))
runs_df = runs.sort_values(by=['user_id', 'time_started'])
# Mark duplicates, keeping the first (earliest) run for each user
duplicates = runs_df.duplicated(subset='user_id', keep='first')
# Collect the IDs of the duplicate runs to remove
runs_to_remove = runs_df[duplicates]['id'].tolist()
print("Run IDs to remove:", runs_to_remove)
runs = runs[~runs['id'].isin(runs_to_remove)]
imgs = imgs[~imgs['run_id'].isin(runs_to_remove)]
panos = panos[~panos['run_id'].isin(runs_to_remove)]
print()
print('n_runs after:', len(runs))
print('n imgs tasks after:', len(imgs))
print('n pano tasks after:', len(panos))
n_runs before: 348
n imgs tasks before: 2016
n pano tasks before: 685
Run IDs to remove: [222, 52, 657, 77, 78, 151, 320, 322, 404, 153, 227, 230, 231, 366, 384, 232, 473, 474, 564]

n_runs after: 329
n imgs tasks after: 1940
n pano tasks after: 650
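An equivalent, loop-free way to express the deduplication (a sketch; it should select the same run IDs as `runs_to_remove` above) is `drop_duplicates` on the sorted dataframe:

# Keep the earliest run per user; everything else is a duplicate to drop.
first_runs = runs_df.drop_duplicates(subset='user_id', keep='first')
duplicate_ids = runs_df.loc[~runs_df['id'].isin(first_runs['id']), 'id'].tolist()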
Filter users¶
Filter users to keep only those that are present in the `runs` dataframe.
In [13]:
user_ids = runs['user_id'].unique()
users_filtered = users[users['id'].isin(user_ids)]
print(len(users_filtered)) # should be the same as len(runs) since duplicated runs were eliminated
329
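Since deduplication left exactly one run per user, the two tables should line up one-to-one; a quick assertion (a sketch, not one of the original cells) makes this explicit:

# One run per user, so the counts and ID sets must match exactly.
assert len(users_filtered) == len(runs)
assert set(users_filtered['id']) == set(runs['user_id'])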
Export¶
In [14]:
panos.to_json(PROCESSED / 'panorama_scores.json', orient='records')
imgs.to_json(PROCESSED / 'image_scores.json', orient='records')
runs.to_json(PROCESSED / 'runs.json', orient='records')
users_filtered.to_json(PROCESSED / 'users.json', orient='records')
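As a final sanity check (a sketch, not part of the original export cell), the processed files can be read back to confirm the row counts survive the round trip:

# Reload each exported file and compare its length to the in-memory dataframe.
for name, df in [('panorama_scores', panos), ('image_scores', imgs),
                 ('runs', runs), ('users', users_filtered)]:
    reloaded = pd.read_json(PROCESSED / f'{name}.json')
    assert len(reloaded) == len(df), name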