https://github.com/EvolvingLMMs-Lab/EgoLife/tree/main/EgoRAG#method-1-create-from-json-fast
Does "captions for each video" mentioned here mean the SRT files? If so, there are too many captions and the ChromaDB build is really slow: building the index for A1_JAKE alone takes 3 days. Is there a simpler (coarser) version of the captions that should be used instead?
The following is my SRT-to-JSON conversion script:
`
import os
import re
import json
from pathlib import Path
# Matches a whole SRT timing line such as "00:01:02,345 --> 00:01:04,000" and
# captures eight numeric groups: start h, m, s, ms followed by end h, m, s, ms.
SRT_TIMESTAMP_RE = re.compile(r"^(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-->\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})$")
def hms_to_ms(h: int, m: int, s: int, ms: int) -> int:
    """Convert hours, minutes, seconds and milliseconds to total milliseconds.

    Args:
        h: Hours component.
        m: Minutes component.
        s: Seconds component.
        ms: Milliseconds component.

    Returns:
        The combined duration expressed in milliseconds.
    """
    return ((h * 60 + m) * 60 + s) * 1000 + ms
def hms_to_8digit(h: int, m: int, s: int, ms: int) -> int:
    """Pack a clock time into the integer HHMMSSFF.

    FF is hundredths of a second; the milliseconds are truncated (not
    rounded) to that resolution. Example: 11:09:42,080 -> 11094208.

    Args:
        h: Hours component.
        m: Minutes component.
        s: Seconds component.
        ms: Milliseconds component.

    Returns:
        The packed integer. Leading zeros are not preserved, so hours below
        10 produce a value with fewer than eight digits.
    """
    return h * 1000000 + m * 10000 + s * 100 + ms // 10
def parse_srt(path: Path):
    """Parse an SRT subtitle file into a list of caption cues.

    Each cue is an optional numeric index line, a timing line matched by
    ``SRT_TIMESTAMP_RE``, and one or more text lines terminated by a blank
    line (or end of file). Lines that are not part of a recognizable cue are
    skipped.

    Args:
        path: Location of the ``.srt`` file (read as UTF-8).

    Returns:
        A list of dicts with keys ``start_8digit`` and ``end_8digit`` (see
        ``hms_to_8digit``) and ``text`` (the cue's lines joined by a space).
    """
    # 'utf-8-sig' drops a leading byte-order mark, which would otherwise keep
    # the very first line from being recognized as an index/timing line.
    with open(path, 'r', encoding='utf-8-sig') as f:
        lines = [line.strip() for line in f]

    entries = []
    total = len(lines)
    i = 0
    while i < total:
        # Skip the numeric cue index, if present.
        if lines[i].isdigit():
            i += 1
            if i >= total:
                break

        # A cue only starts at a timing line; anything else is ignored.
        m = SRT_TIMESTAMP_RE.match(lines[i])
        if not m:
            i += 1
            continue
        i += 1
        sh, sm, ss, sms, eh, em, es, ems = map(int, m.groups())

        # Caption text runs until the next blank line.
        text_lines = []
        while i < total and lines[i] != '':
            text_lines.append(lines[i])
            i += 1

        # Consume the blank separator line(s) before the next cue.
        while i < total and lines[i] == '':
            i += 1

        entries.append({
            'start_8digit': hms_to_8digit(sh, sm, ss, sms),
            'end_8digit': hms_to_8digit(eh, em, es, ems),
            'text': ' '.join(text_lines),
        })
    return entries
def extract_video_start_ms(filename: str) -> int:
    """Return the last run of six or more digits in a file's base name.

    The directory part and the final extension are ignored.

    Args:
        filename: File name or path, e.g. ``DAY1_A1_JAKE_11094208.mp4``.

    Returns:
        The digit run as an int, or -1 when the name contains no such run.
    """
    base = os.path.splitext(os.path.basename(filename))[0]
    nums = re.findall(r"(\d{6,})", base)
    return int(nums[-1]) if nums else -1
def extract_video_start_8digit(filename: str) -> int:
    """Return the last 8-digit group found in a file's base name.

    The directory part and the final extension are ignored. Groups are
    taken left to right without overlap, so a digit run longer than eight
    characters is split into consecutive 8-digit chunks.

    Args:
        filename: File name or path, e.g. ``DAY1_A1_JAKE_11094208.mp4``.

    Returns:
        The group as an int, or -1 when the name contains no 8-digit group.
    """
    base = os.path.splitext(os.path.basename(filename))[0]
    nums = re.findall(r"(\d{8})", base)
    return int(nums[-1]) if nums else -1
def convert_srt_to_json(srt_file: Path, output_dir: Path, video_dir: Path = None):
    """Convert one SRT caption file into a JSON list with absolute timestamps.

    The SRT file name is split on ``_`` and expected to look like
    ``A1_JAKE_DAY1_11000000.srt``: the first two parts form the persona, the
    third is the day, and the optional fourth is the start time of the file.
    Cue times are offsets in HHMMSSFF form (see ``hms_to_8digit``) and are
    added to that start time.
    NOTE(review): the start time in the file name is assumed to use the same
    HHMMSSFF encoding -- confirm against the dataset.

    Args:
        srt_file: The ``.srt`` file to convert.
        output_dir: Directory that receives ``<srt stem>.json`` (created if
            missing).
        video_dir: Optional directory of ``*.mp4`` files. When it exists,
            each caption is mapped to the last video whose 8-digit start is
            not after the caption start (or the earliest video if none
            qualifies). Otherwise a name is constructed from day, persona
            and timestamp.

    Returns:
        Path of the JSON file that was written.

    Raises:
        ValueError: If the file name has fewer than three ``_``-separated
            parts.
    """
    from bisect import bisect_right

    def to_centiseconds(ts_8digit: int) -> int:
        # Unpack HHMMSSFF into a plain count of hundredths of a second.
        hours, rest = divmod(ts_8digit, 1000000)
        minutes, rest = divmod(rest, 10000)
        seconds, hundredths = divmod(rest, 100)
        return ((hours * 60 + minutes) * 60 + seconds) * 100 + hundredths

    def from_centiseconds(total_cs: int) -> int:
        # Pack a count of hundredths of a second back into HHMMSSFF.
        total_seconds, hundredths = divmod(total_cs, 100)
        total_minutes, seconds = divmod(total_seconds, 60)
        hours, minutes = divmod(total_minutes, 60)
        return hours * 1000000 + minutes * 10000 + seconds * 100 + hundredths

    name = srt_file.stem  # e.g., A1_JAKE_DAY1_11000000
    parts = name.split('_')
    if len(parts) < 3:
        raise ValueError(f'Unexpected SRT filename format: {srt_file.name}')
    persona = '_'.join(parts[:2])  # A1_JAKE
    date = parts[2]  # DAY1
    try:
        base_8digit = int(parts[3]) if len(parts) > 3 else 0
    except ValueError:
        # Non-numeric fourth part: treat cue times as already absolute.
        base_8digit = 0
    base_cs = to_centiseconds(base_8digit)

    cues = parse_srt(srt_file)

    # Index the available videos by their 8-digit start, sorted ascending.
    video_index = []
    if video_dir and Path(video_dir).is_dir():
        for vf in Path(video_dir).glob('*.mp4'):
            start_8digit = extract_video_start_8digit(vf.name)
            if start_8digit >= 0:
                video_index.append((start_8digit, vf.name))
        video_index.sort(key=lambda item: item[0])
    video_starts = [start for start, _ in video_index]

    def locate_video_filename(ts_8digit: int) -> str:
        if not video_index:
            # Fallback to a constructed name when no video index is available.
            return f"{date}_{persona}_{ts_8digit}.mp4"
        # Last video whose start <= ts_8digit; clamp to the first video when
        # the timestamp precedes every indexed start.
        pos = bisect_right(video_starts, ts_8digit) - 1
        return video_index[max(pos, 0)][1]

    def absolute(offset_8digit: int) -> int:
        # Add in real time units so seconds/minutes carry correctly; adding
        # the packed integers directly would produce values such as
        # 11603000 (minute 60) whenever the sum crosses a minute or hour.
        return from_centiseconds(base_cs + to_centiseconds(offset_8digit))

    result = []
    for cue in cues:
        start_abs = absolute(cue['start_8digit'])
        end_abs = absolute(cue['end_8digit'])
        video_filename = locate_video_filename(start_abs)
        video_path = f"/mnt/data/raw_data/EgoLife/{persona}/{date}/{video_filename}"
        result.append({
            'start_time': str(start_abs),
            'end_time': str(end_abs),
            'text': cue['text'],
            'date': date,
            'video_path': video_path,
        })

    output_dir.mkdir(parents=True, exist_ok=True)
    out_file = output_dir / (name + '.json')
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    return out_file
def summarize_day(persona, date=None):
    """Collect all caption entries for one persona/day into a single list.

    No summarization is performed: entries are only gathered.

    Args:
        persona: Either a list of entries, which is returned unchanged, or a
            persona name used together with ``date`` to locate JSON files
            under the DenseCaption folder.
        date: Day folder name (e.g. ``DAY1``); required when ``persona`` is a
            name.

    Returns:
        A list of caption entries. It is empty when ``date`` is missing or
        the folder does not exist. JSON files that cannot be read or parsed
        are skipped.
    """
    if isinstance(persona, list):
        return persona
    if date is None:
        # Every other path yields a list, so the "nothing to read" case does
        # too (it used to return a dict, giving callers two shapes to handle).
        return []

    folder = Path('/mnt/data/raw_data/EgoLife/EgoLifeCapJSON/DenseCaption') / persona / date
    entries = []
    if not folder.is_dir():
        return entries

    for jf in sorted(folder.glob('*.json')):
        try:
            with open(jf, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Best effort: an unreadable or malformed file is skipped.
            continue
        if isinstance(data, list):
            entries.extend(data)
        elif isinstance(data, dict):
            # Handle a possible wrapped structure: {"captions": [...]}.
            captions = data.get('captions')
            if isinstance(captions, list):
                entries.extend(captions)
            else:
                entries.append(data)
    return entries
def summarize_human(human):
    """Placeholder for a per-person summary; currently does nothing.

    Args:
        human: Unused.

    Returns:
        None.
    """
    return None
def write_daily_and_human_summaries(all_outputs_map, summary_base_dir: Path):
    """Write per-day and per-persona aggregate JSON files.

    For every persona, ``<summary_base_dir>/<persona>/<persona>_<day>.json``
    receives the entries of that day, and ``<summary_base_dir>/<persona>.json``
    receives the entries of all days, in the iteration order of the mapping.

    Args:
        all_outputs_map: ``{persona: {day: [json_path, ...]}}`` listing the
            caption JSON files to aggregate.
        summary_base_dir: Output directory (created if missing).
    """
    summary_base_dir.mkdir(parents=True, exist_ok=True)
    for persona, days in all_outputs_map.items():
        persona_dir = summary_base_dir / persona
        persona_dir.mkdir(parents=True, exist_ok=True)

        human_aggregate = []
        for day, json_paths in days.items():
            day_entries = []
            for jp in json_paths:
                try:
                    with open(jp, 'r', encoding='utf-8') as f:
                        day_entries.extend(json.load(f))
                except (OSError, ValueError, TypeError):
                    # Best effort: skip files that cannot be read, parsed,
                    # or do not contain an iterable of entries.
                    continue

            # Build the daily summary from the files listed for this run
            # rather than re-reading a fixed directory, which would be wrong
            # whenever the JSON files were written somewhere else.
            daily_summary = summarize_day(day_entries)
            with open(persona_dir / f'{persona}_{day}.json', 'w', encoding='utf-8') as f:
                json.dump(daily_summary, f, ensure_ascii=False, indent=2)
            human_aggregate.extend(day_entries)

        # Per-persona file: every entry from all of the persona's days.
        with open(summary_base_dir / f'{persona}.json', 'w', encoding='utf-8') as f:
            json.dump(human_aggregate, f, ensure_ascii=False, indent=2)
def main():
    """Command-line entry point for the SRT-to-JSON conversion.

    Modes, selected by the arguments:
      * ``--recursive`` with a directory: expects ``<root>/<persona>/<DAYn>/*.srt``,
        converts every file and then writes the daily/persona aggregates.
      * a directory without ``--recursive``: converts every ``.srt`` found
        below it; the output folder is derived from each file name.
      * a single file: converts just that file.

    The paths of all written JSON files are printed at the end.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Convert SRT captions to JSON with absolute times and video paths')
    parser.add_argument('--srt', type=str, required=True, help='Path to a .srt file or a directory containing .srt files')
    parser.add_argument('--out', type=str, default=str(Path('/mnt/data/raw_data/EgoLife/EgoLifeCapJSON/DenseCaption')),
                        help='Output base directory for JSON files (will mirror persona/date subfolders)')
    parser.add_argument('--video-dir', type=str, default=None, help='Directory containing the corresponding videos (e.g., /mnt/data/raw_data/EgoLife/A1_JAKE/DAY1)')
    parser.add_argument('--recursive', action='store_true', help='Recursively process A1_JAKE..A6_SHURE and DAY1..DAY7 under the given SRT root')
    parser.add_argument('--summaries-out', type=str, default=str(Path('/mnt/data/raw_data/EgoLife/EgoLifeCapJSON/DenseCaption')),
                        help='Output base directory for daily and human summaries')
    args = parser.parse_args()

    srt_path = Path(args.srt)
    out_dir = Path(args.out)
    video_dir = Path(args.video_dir) if args.video_dir else None
    summaries_out = Path(args.summaries_out)
    outputs = []

    def target_out_dir_for(file_path: Path) -> Path:
        # Mirror subfolders: <base>/<persona>/<date>/ derived from the file name.
        parts = file_path.stem.split('_')
        persona = '_'.join(parts[:2]) if len(parts) >= 2 else 'UNKNOWN'
        date = parts[2] if len(parts) >= 3 else 'DAYX'
        return out_dir / persona / date

    if args.recursive and srt_path.is_dir():
        # Expect structure: <root>/<persona>/<DAYn>/*.srt
        all_outputs_map = {}
        for persona_dir in sorted(d for d in srt_path.iterdir() if d.is_dir()):
            persona = persona_dir.name
            persona_days = all_outputs_map.setdefault(persona, {})
            for day_dir in sorted(d for d in persona_dir.iterdir() if d.is_dir()):
                day = day_dir.name
                day_outputs = persona_days.setdefault(day, [])
                # Videos are looked up in the matching persona/day folder.
                vdir = Path('/mnt/data/raw_data/EgoLife') / persona / day
                for p in sorted(day_dir.glob('*.srt')):
                    outp = convert_srt_to_json(p, out_dir / persona / day, vdir)
                    outputs.append(outp)
                    day_outputs.append(outp)
        write_daily_and_human_summaries(all_outputs_map, summaries_out)
    elif srt_path.is_dir():
        # Sorted so runs are reproducible; rglob order is filesystem-dependent.
        for p in sorted(srt_path.rglob('*.srt')):
            outputs.append(convert_srt_to_json(p, target_out_dir_for(p), video_dir))
    else:
        outputs.append(convert_srt_to_json(srt_path, target_out_dir_for(srt_path), video_dir))

    print('Wrote:', *map(str, outputs), sep='\n')
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
`
https://github.com/EvolvingLMMs-Lab/EgoLife/tree/main/EgoRAG#method-1-create-from-json-fast
Does "captions for each video" mentioned here mean the SRT files? If so, there are too many captions and the ChromaDB build is really slow: building the index for A1_JAKE alone takes 3 days. Is there a simpler (coarser) version of the captions that should be used instead?
The following is my SRT-to-JSON conversion script:
`
import os
import re
import json
from pathlib import Path
# Matches a whole SRT timing line such as "00:01:02,345 --> 00:01:04,000" and
# captures eight numeric groups: start h, m, s, ms followed by end h, m, s, ms.
SRT_TIMESTAMP_RE = re.compile(r"^(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-->\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})$")
def hms_to_ms(h: int, m: int, s: int, ms: int) -> int:
    """Convert hours, minutes, seconds and milliseconds to total milliseconds.

    Args:
        h: Hours component.
        m: Minutes component.
        s: Seconds component.
        ms: Milliseconds component.

    Returns:
        The combined duration expressed in milliseconds.
    """
    return ((h * 60 + m) * 60 + s) * 1000 + ms
def hms_to_8digit(h: int, m: int, s: int, ms: int) -> int:
    """Pack a clock time into the integer HHMMSSFF.

    FF is hundredths of a second; the milliseconds are truncated (not
    rounded) to that resolution. Example: 11:09:42,080 -> 11094208.

    Args:
        h: Hours component.
        m: Minutes component.
        s: Seconds component.
        ms: Milliseconds component.

    Returns:
        The packed integer. Leading zeros are not preserved, so hours below
        10 produce a value with fewer than eight digits.
    """
    return h * 1000000 + m * 10000 + s * 100 + ms // 10
def parse_srt(path: Path):
entries = []
with open(path, 'r', encoding='utf-8') as f:
lines = [line.rstrip('\n') for line in f]
def extract_video_start_ms(filename: str) -> int:
    """Return the last run of six or more digits in a file's base name.

    The directory part and the final extension are ignored.

    Args:
        filename: File name or path, e.g. ``DAY1_A1_JAKE_11094208.mp4``.

    Returns:
        The digit run as an int, or -1 when the name contains no such run.
    """
    base = os.path.splitext(os.path.basename(filename))[0]
    nums = re.findall(r"(\d{6,})", base)
    return int(nums[-1]) if nums else -1
def extract_video_start_8digit(filename: str) -> int:
    """Return the last 8-digit group found in a file's base name.

    The directory part and the final extension are ignored. Groups are
    taken left to right without overlap, so a digit run longer than eight
    characters is split into consecutive 8-digit chunks.

    Args:
        filename: File name or path, e.g. ``DAY1_A1_JAKE_11094208.mp4``.

    Returns:
        The group as an int, or -1 when the name contains no 8-digit group.
    """
    base = os.path.splitext(os.path.basename(filename))[0]
    nums = re.findall(r"(\d{8})", base)
    return int(nums[-1]) if nums else -1
def convert_srt_to_json(srt_file: Path, output_dir: Path, video_dir: Path = None):
# Expect filename like A1_JAKE_DAY1_11000000.srt
name = srt_file.stem # e.g., A1_JAKE_DAY1_11000000
parts = name.split('')
if len(parts) < 3:
raise ValueError(f'Unexpected SRT filename format: {srt_file.name}')
persona = ''.join(parts[:2]) # A1_JAKE
date = parts[2] # DAY1
try:
base_ms = int(parts[3]) if len(parts) > 3 else 0
except ValueError:
base_ms = 0
def summarize_day(persona, date=None):
    """Collect all caption entries for one persona/day into a single list.

    No summarization is performed: entries are only gathered.

    Args:
        persona: Either a list of entries, which is returned unchanged, or a
            persona name used together with ``date`` to locate JSON files
            under the DenseCaption folder.
        date: Day folder name (e.g. ``DAY1``); required when ``persona`` is a
            name.

    Returns:
        A list of caption entries. It is empty when ``date`` is missing or
        the folder does not exist. JSON files that cannot be read or parsed
        are skipped.
    """
    if isinstance(persona, list):
        return persona
    if date is None:
        # Every other path yields a list, so the "nothing to read" case does
        # too (it used to return a dict, giving callers two shapes to handle).
        return []

    folder = Path('/mnt/data/raw_data/EgoLife/EgoLifeCapJSON/DenseCaption') / persona / date
    entries = []
    if not folder.is_dir():
        return entries

    for jf in sorted(folder.glob('*.json')):
        try:
            with open(jf, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Best effort: an unreadable or malformed file is skipped.
            continue
        if isinstance(data, list):
            entries.extend(data)
        elif isinstance(data, dict):
            # Handle a possible wrapped structure: {"captions": [...]}.
            captions = data.get('captions')
            if isinstance(captions, list):
                entries.extend(captions)
            else:
                entries.append(data)
    return entries
def summarize_human(human):
    """Placeholder for a per-person summary; currently does nothing.

    Args:
        human: Unused.

    Returns:
        None.
    """
    return None
def write_daily_and_human_summaries(all_outputs_map, summary_base_dir: Path):
    """Write per-day and per-persona aggregate JSON files.

    For every persona, ``<summary_base_dir>/<persona>/<persona>_<day>.json``
    receives the entries of that day, and ``<summary_base_dir>/<persona>.json``
    receives the entries of all days, in the iteration order of the mapping.

    Args:
        all_outputs_map: ``{persona: {day: [json_path, ...]}}`` listing the
            caption JSON files to aggregate.
        summary_base_dir: Output directory (created if missing).
    """
    summary_base_dir.mkdir(parents=True, exist_ok=True)
    for persona, days in all_outputs_map.items():
        persona_dir = summary_base_dir / persona
        persona_dir.mkdir(parents=True, exist_ok=True)

        human_aggregate = []
        for day, json_paths in days.items():
            day_entries = []
            for jp in json_paths:
                try:
                    with open(jp, 'r', encoding='utf-8') as f:
                        day_entries.extend(json.load(f))
                except (OSError, ValueError, TypeError):
                    # Best effort: skip files that cannot be read, parsed,
                    # or do not contain an iterable of entries.
                    continue

            # Build the daily summary from the files listed for this run
            # rather than re-reading a fixed directory, which would be wrong
            # whenever the JSON files were written somewhere else.
            daily_summary = summarize_day(day_entries)
            with open(persona_dir / f'{persona}_{day}.json', 'w', encoding='utf-8') as f:
                json.dump(daily_summary, f, ensure_ascii=False, indent=2)
            human_aggregate.extend(day_entries)

        # Per-persona file: every entry from all of the persona's days.
        with open(summary_base_dir / f'{persona}.json', 'w', encoding='utf-8') as f:
            json.dump(human_aggregate, f, ensure_ascii=False, indent=2)
def main():
import argparse
parser = argparse.ArgumentParser(description='Convert SRT captions to JSON with absolute times and video paths')
parser.add_argument('--srt', type=str, required=True, help='Path to a .srt file or a directory containing .srt files')
parser.add_argument('--out', type=str, default=str(Path('/mnt/data/raw_data/EgoLife/EgoLifeCapJSON/DenseCaption')),
help='Output base directory for JSON files (will mirror persona/date subfolders)')
parser.add_argument('--video-dir', type=str, default=None, help='Directory containing the corresponding videos (e.g., /mnt/data/raw_data/EgoLife/A1_JAKE/DAY1)')
parser.add_argument('--recursive', action='store_true', help='Recursively process A1_JAKE..A6_SHURE and DAY1..DAY7 under the given SRT root')
parser.add_argument('--summaries-out', type=str, default=str(Path('/mnt/data/raw_data/EgoLife/EgoLifeCapJSON/DenseCaption')),
help='Output base directory for daily and human summaries')
args = parser.parse_args()
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
`