Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion data/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ MECOL2W2/*
!MECOL2W2/folds_metadata
MECOL2/*
!MECOL2/folds_metadata
!MECOL2/stimuli
IITBHGC/*
!IITBHGC/folds_metadata

Expand Down
13 changes: 0 additions & 13 deletions data/MECOL2/stimuli/stimuli.csv

This file was deleted.

2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ dependencies:
- peft=0.15.2
- spacy=3.8.5
- pytorch-metric-learning=2.9.0
- rdata=1.0.0
- pip:
- -e . # For development purposes
- git+https://github.com/aeye-lab/pymovements.git@siqube-stack-pr
- git+https://github.com/lacclab/text-metrics.git
- en_core_web_sm@https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
- da_core_news_sm@https://github.com/explosion/spacy-models/releases/download/da_core_news_sm-3.8.0/da_core_news_sm-3.8.0-py3-none-any.whl
- de_core_news_sm@https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl
- juliacall
26 changes: 26 additions & 0 deletions src/data/preprocessing/download_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pathlib import Path

import pymovements as pm
import rdata
import requests
from loguru import logger
from tqdm import tqdm
Expand All @@ -15,6 +16,7 @@
DataSets.MECO_L2: { # Hosted on MECO L2: The Multilingual Eye-movement COrpus, L2 (English) - https://osf.io/q9h43
'MECOL2W1/demographics/joint.ind.diff.l2.rda': '4zu8d',
'MECOL2W2/demographics/joint.ind.diff.l2.w2.rda': 'keuvm',
'MECOL2/stimuli/texts.meco.l2.rda': 'zwfdb',
},
}

Expand Down Expand Up @@ -44,6 +46,29 @@ def download_auxiliary_files(root: Path, dataset_name: str) -> None:
fp.write(chunk)


def convert_rda_to_csv(root: Path, dataset_name: str) -> None:
    """Export the MECO L2 stimuli table from its RDA file to a CSV file.

    Only acts when ``dataset_name`` is ``DataSets.MECO_L2``; any other
    dataset is a no-op. The conversion is skipped (with a log message) when
    the CSV is already present or when the RDA source file is missing.

    Args:
        root: Base data directory that contains the ``MECOL2`` folder.
        dataset_name: Name of the dataset currently being prepared.
    """
    if dataset_name != DataSets.MECO_L2:
        return

    csv_path = root / 'MECOL2/stimuli/stimuli.csv'
    rda_path = root / 'MECOL2/stimuli/texts.meco.l2.rda'

    if csv_path.exists():
        # A previous run already produced the CSV; do not overwrite it.
        logger.info(f'{csv_path} already exists. Skipping conversion...')
    elif not rda_path.exists():
        # Nothing to convert from; warn instead of failing.
        logger.warning(f'{rda_path} not found. Skipping conversion...')
    else:
        logger.info(f'Converting {rda_path} to {csv_path}')
        # The stimuli table is stored under the object name 'd' in the RDA
        # file (presumably converted to a pandas DataFrame -- confirm).
        stimuli = rdata.read_rda(rda_path)['d']
        csv_path.parent.mkdir(parents=True, exist_ok=True)
        stimuli.to_csv(csv_path, index=False)
        logger.info(f'Saved stimuli CSV to {csv_path}')


def prepare_dataset_definition(dataset_name: str):
"""Prepare dataset definition with gaze files disabled."""
dataset_def = pm.DatasetLibrary.get(dataset_name)
Expand Down Expand Up @@ -110,6 +135,7 @@ def main() -> int:
load_or_download_dataset(dataset_name, data_path, download=True)

download_auxiliary_files(data_path, dataset_name)
convert_rda_to_csv(data_path, dataset_name)

return 0

Expand Down
Loading