Skip to content

Are the "captions for each video" the srt files in "EgoLife/EgoLifeCap/DenseCaption/..."? Still too slow to build: 60 hours for A1_JAKE #16

Description

@Philip-2000

https://github.com/EvolvingLMMs-Lab/EgoLife/tree/main/EgoRAG#method-1-create-from-json-fast
Does the "captions for each video" mentioned here mean the srt files? There are too many captions, and the ChromaDB build is really slow: building the index for A1_JAKE takes 3 days. Is there perhaps a simpler version of the captions?

The following is my SRT-to-JSON conversion script:

`
import os
import re
import json
from pathlib import Path

# Matches a full SRT cue timing line, e.g. "00:01:02,345 --> 00:01:05,678".
# The eight capture groups are, in order: start hours, minutes, seconds,
# milliseconds, then end hours, minutes, seconds, milliseconds.
SRT_TIMESTAMP_RE = re.compile(r"^(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-->\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})$")

def hms_to_ms(h: int, m: int, s: int, ms: int) -> int:
    """Return the total number of milliseconds in ``h:m:s,ms``.

    Args:
        h: Hours.
        m: Minutes.
        s: Seconds.
        ms: Milliseconds.

    Returns:
        The duration expressed as a single millisecond count.
    """
    return ((h * 60 + m) * 60 + s) * 1000 + ms

def hms_to_8digit(h: int, m: int, s: int, ms: int) -> int:
    """Pack a time into a positional ``HHMMSSFF`` integer code.

    ``FF`` is the millisecond part truncated to centiseconds (``ms // 10``),
    so ``11:09:42,080`` becomes ``11094208``.

    Args:
        h: Hours.
        m: Minutes.
        s: Seconds.
        ms: Milliseconds (0-999); the last digit is dropped.

    Returns:
        The packed integer. It is a digit-positional code, not a duration:
        leading zeros are lost because the value is an ``int``.
    """
    return h * 1000000 + m * 10000 + s * 100 + ms // 10

def parse_srt(path: Path):
    """Parse an SRT subtitle file into a list of cue dictionaries.

    Each cue is expected to look like an optional numeric index line, a
    timing line matching ``SRT_TIMESTAMP_RE``, one or more text lines, and a
    blank separator. Lines that are not a timing line where one is expected
    are skipped.

    Args:
        path: Location of the ``.srt`` file.

    Returns:
        A list of dicts with keys ``start_8digit`` and ``end_8digit``
        (``HHMMSSFF`` codes produced by ``hms_to_8digit``) and ``text`` (the
        cue's text lines joined with single spaces).
    """
    # 'utf-8-sig' transparently drops a leading BOM; with plain 'utf-8' the
    # BOM stays glued to the first line and, when the file has no index
    # lines, makes the first timing line fail to match.
    with open(path, 'r', encoding='utf-8-sig') as f:
        # Every later use of a line strips it, so strip once up front.
        lines = [line.strip() for line in f]

    entries = []
    i = 0
    total = len(lines)
    while i < total:
        # Skip the cue index line (a bare number), if present.
        if lines[i].isdigit():
            i += 1
        if i >= total:
            break

        # Timing line; anything else here is noise and is skipped.
        m = SRT_TIMESTAMP_RE.match(lines[i])
        if not m:
            i += 1
            continue
        i += 1
        sh, sm, ss, sms, eh, em, es, ems = map(int, m.groups())

        # Caption text runs until the next blank line.
        text_lines = []
        while i < total and lines[i] != '':
            text_lines.append(lines[i])
            i += 1
        # Consume the blank separator line(s).
        while i < total and lines[i] == '':
            i += 1

        entries.append({
            'start_8digit': hms_to_8digit(sh, sm, ss, sms),
            'end_8digit': hms_to_8digit(eh, em, es, ems),
            'text': ' '.join(text_lines),
        })
    return entries

def extract_video_start_ms(filename: str) -> int:
    """Return the last run of six or more digits in a file's base name.

    The extension is removed before searching. Despite the function's name,
    no unit conversion happens: the digit run is returned as an ``int``
    exactly as it appears in the name.

    Args:
        filename: A file name or path, e.g. ``DAY1_A1_JAKE_11094208.mp4``.

    Returns:
        The integer value of the last 6+ digit run, or ``-1`` if there is none.
    """
    base = os.path.splitext(os.path.basename(filename))[0]
    nums = re.findall(r"(\d{6,})", base)
    return int(nums[-1]) if nums else -1

def extract_video_start_8digit(filename: str) -> int:
    """Return the last standalone 8-digit number in a file's base name.

    The extension is removed before searching. Only digit runs that are
    exactly eight digits long count; a bare ``\\d{8}`` would also match an
    arbitrary 8-digit slice of a longer number, which is not a time code.

    Args:
        filename: A file name or path, e.g. ``DAY1_A1_JAKE_11094208.mp4``.

    Returns:
        The 8-digit number as an ``int``, or ``-1`` if the name has none.
    """
    base = os.path.splitext(os.path.basename(filename))[0]
    # Lookarounds pin the match to a digit run of exactly 8 characters.
    nums = re.findall(r"(?<!\d)\d{8}(?!\d)", base)
    return int(nums[-1]) if nums else -1

def _code_to_centis(code: int) -> int:
    """Decode an ``HHMMSSFF`` integer time code into total centiseconds."""
    hours, rest = divmod(code, 1000000)
    minutes, rest = divmod(rest, 10000)
    seconds, frac = divmod(rest, 100)
    return ((hours * 60 + minutes) * 60 + seconds) * 100 + frac


def _centis_to_code(total: int) -> int:
    """Encode a centisecond count back into an ``HHMMSSFF`` integer code."""
    rest, frac = divmod(total, 100)
    rest, seconds = divmod(rest, 60)
    hours, minutes = divmod(rest, 60)
    return hours * 1000000 + minutes * 10000 + seconds * 100 + frac


def convert_srt_to_json(srt_file: Path, output_dir: Path, video_dir: Path = None):
    """Convert one SRT caption file into a JSON list of timed captions.

    The SRT file name is expected to look like ``A1_JAKE_DAY1_11000000.srt``:
    ``<persona (two parts)>_<date>_<base time code>``. Each cue's start/end
    time is offset by the base time code taken from the file name, and the
    cue is associated with the video file whose start code is the latest one
    not after the cue's start.

    Args:
        srt_file: Path to the ``.srt`` file.
        output_dir: Directory to write ``<srt stem>.json`` into (created if
            missing).
        video_dir: Optional directory of ``*.mp4`` files used to pick the
            video for each cue. If missing or not a directory, a file name is
            constructed from date, persona and cue start instead.

    Returns:
        Path of the JSON file that was written.

    Raises:
        ValueError: If the file name has fewer than three ``_``-separated
            parts.
    """
    from bisect import bisect_right

    # Expect filename like A1_JAKE_DAY1_11000000.srt
    name = srt_file.stem  # e.g., A1_JAKE_DAY1_11000000
    parts = name.split('_')
    if len(parts) < 3:
        raise ValueError(f'Unexpected SRT filename format: {srt_file.name}')
    persona = '_'.join(parts[:2])  # A1_JAKE
    date = parts[2]  # DAY1
    try:
        # NOTE: despite the historical name this is an HHMMSSFF code, not ms.
        base_ms = int(parts[3]) if len(parts) > 3 else 0
    except ValueError:
        base_ms = 0

    cues = parse_srt(srt_file)

    # Index the available videos by the 8-digit start code in their names,
    # e.g. files under /mnt/data/raw_data/EgoLife/A1_JAKE/DAY1.
    video_index = []
    if video_dir and Path(video_dir).is_dir():
        for vf in Path(video_dir).glob('*.mp4'):
            start_8digit = extract_video_start_8digit(vf.name)
            if start_8digit >= 0:
                video_index.append((start_8digit, vf.name))
        video_index.sort(key=lambda x: x[0])
    video_starts = [start for start, _ in video_index]

    def locate_video_filename(ts_8digit: int) -> str:
        if not video_index:
            # Fallback to constructed name if index missing
            return f"{date}_{persona}_{ts_8digit}.mp4"
        # Last video whose start code is <= ts_8digit; if every video starts
        # later than the cue, fall back to the earliest one.
        pos = bisect_right(video_starts, ts_8digit)
        return video_index[max(pos - 1, 0)][1]

    # HHMMSSFF codes are positional, so adding them as plain integers yields
    # invalid times whenever minutes/seconds carry past 60. Add in
    # centiseconds and re-encode instead.
    base_centis = _code_to_centis(base_ms)

    result = []
    for cue in cues:
        start_code = _centis_to_code(base_centis + _code_to_centis(cue['start_8digit']))
        end_code = _centis_to_code(base_centis + _code_to_centis(cue['end_8digit']))
        # Choose the actual video file containing the start time.
        video_filename = locate_video_filename(start_code)
        video_path = f"/mnt/data/raw_data/EgoLife/{persona}/{date}/{video_filename}"
        result.append({
            'start_time': str(start_code),
            'end_time': str(end_code),
            'text': cue['text'],
            'date': date,
            'video_path': video_path,
        })

    output_dir.mkdir(parents=True, exist_ok=True)
    out_file = output_dir / (name + '.json')
    with open(out_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    return out_file

def summarize_day(persona, date=None):
    """Collect the caption entries for one persona/day.

    Two calling modes are supported:

    * ``persona`` is a list: it is treated as the entries and returned as-is.
    * ``persona`` is a persona name and ``date`` a day name: every ``*.json``
      file directly inside
      ``/mnt/data/raw_data/EgoLife/EgoLifeCapJSON/DenseCaption/<persona>/<date>``
      is read in sorted order and the entries are concatenated. Unreadable
      or malformed files are skipped.

    Args:
        persona: A list of entries, or a persona folder name.
        date: Day folder name; required when ``persona`` is a name.

    Returns:
        The list of entries. When ``persona`` is a name and ``date`` is
        ``None`` the legacy placeholder ``{'captions': [], 'count': 0}`` is
        returned instead (NOTE(review): a dict, unlike every other path;
        kept for backward compatibility).
    """
    if isinstance(persona, list):
        return persona
    if date is None:
        return {'captions': [], 'count': 0}

    folder = Path('/mnt/data/raw_data/EgoLife/EgoLifeCapJSON/DenseCaption') / persona / date
    entries = []
    if not folder.is_dir():
        return entries

    for jf in sorted(folder.glob('*.json')):
        try:
            with open(jf, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError):
            # Best effort: an unreadable or malformed file (JSON and Unicode
            # decode errors are ValueErrors) must not abort the whole day.
            continue
        if isinstance(data, list):
            entries.extend(data)
        elif isinstance(data, dict):
            # Handle a possible wrapped structure: {"captions": [...]}.
            if isinstance(data.get('captions'), list):
                entries.extend(data['captions'])
            else:
                entries.append(data)
    return entries

def summarize_human(human):
    """Placeholder for a per-person summary; currently does nothing.

    Args:
        human: Unused.

    Returns:
        ``None``.
    """
    pass

def write_daily_and_human_summaries(all_outputs_map, summary_base_dir: Path):
    """Write per-day and per-persona aggregate JSON files.

    For every persona, the entries of each day's JSON files are concatenated
    and written to ``<summary_base_dir>/<persona>/<persona>_<day>.json``; the
    concatenation over all of the persona's days is written to
    ``<summary_base_dir>/<persona>.json``.

    Args:
        all_outputs_map: Mapping ``{persona: {day: [json_path, ...]}}``.
        summary_base_dir: Root directory for the summaries (created if
            missing).

    Returns:
        ``None``. Input files that cannot be read or parsed, or that do not
        contain a JSON list, are skipped.
    """
    summary_base_dir.mkdir(parents=True, exist_ok=True)
    for persona, days in all_outputs_map.items():
        persona_dir = summary_base_dir / persona
        persona_dir.mkdir(parents=True, exist_ok=True)
        human_aggregate = []
        for day, json_paths in days.items():
            day_entries = []
            for jp in json_paths:
                try:
                    with open(jp, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                except (OSError, ValueError):
                    # Best effort: skip unreadable or malformed files.
                    continue
                if isinstance(data, list):
                    day_entries.extend(data)

            # Write the daily summary from the entries just loaded. Reading
            # them back from a hard-coded folder would ignore the paths the
            # caller passed in and break for any non-default output dir.
            with open(persona_dir / f'{persona}_{day}.json', 'w', encoding='utf-8') as f:
                json.dump(day_entries, f, ensure_ascii=False, indent=2)
            human_aggregate.extend(day_entries)

        # Write the persona-level summary across all of its days.
        with open(summary_base_dir / f'{persona}.json', 'w', encoding='utf-8') as f:
            json.dump(human_aggregate, f, ensure_ascii=False, indent=2)

def main():
    """Command-line entry point: convert SRT caption files to JSON.

    With ``--recursive`` and a directory, ``--srt`` is walked as
    ``<root>/<persona>/<DAYn>/*.srt``; each file is converted into
    ``<out>/<persona>/<DAYn>/`` and daily/persona summaries are written to
    ``--summaries-out``. Otherwise ``--srt`` is either a single file or a
    directory searched recursively for ``*.srt``, the output sub-folder is
    derived from each file name, and no summaries are written.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Convert SRT captions to JSON with absolute times and video paths')
    parser.add_argument('--srt', type=str, required=True, help='Path to a .srt file or a directory containing .srt files')
    parser.add_argument('--out', type=str, default=str(Path('/mnt/data/raw_data/EgoLife/EgoLifeCapJSON/DenseCaption')),
                        help='Output base directory for JSON files (will mirror persona/date subfolders)')
    parser.add_argument('--video-dir', type=str, default=None, help='Directory containing the corresponding videos (e.g., /mnt/data/raw_data/EgoLife/A1_JAKE/DAY1)')
    parser.add_argument('--recursive', action='store_true', help='Recursively process A1_JAKE..A6_SHURE and DAY1..DAY7 under the given SRT root')
    parser.add_argument('--summaries-out', type=str, default=str(Path('/mnt/data/raw_data/EgoLife/EgoLifeCapJSON/DenseCaption')),
                        help='Output base directory for daily and human summaries')
    args = parser.parse_args()

    srt_path = Path(args.srt)
    out_dir = Path(args.out)
    video_dir = Path(args.video_dir) if args.video_dir else None
    summaries_out = Path(args.summaries_out)

    outputs = []

    def target_out_dir_for(file_path: Path) -> Path:
        # Mirror subfolders: <base>/DenseCaption/<persona>/<date>/
        name = file_path.stem
        parts = name.split('_')
        persona = '_'.join(parts[:2]) if len(parts) >= 2 else 'UNKNOWN'
        date = parts[2] if len(parts) >= 3 else 'DAYX'
        return out_dir / persona / date

    all_outputs_map = {}
    if args.recursive and srt_path.is_dir():
        # Expect structure: <root>/<persona>/<DAYn>/*.srt
        for persona_dir in sorted(d for d in srt_path.iterdir() if d.is_dir()):
            persona = persona_dir.name
            all_outputs_map.setdefault(persona, {})
            for day_dir in sorted(d for d in persona_dir.iterdir() if d.is_dir()):
                day = day_dir.name
                all_outputs_map[persona].setdefault(day, [])
                # Video directory corresponding to this persona/day.
                vdir = Path('/mnt/data/raw_data/EgoLife') / persona / day
                for p in sorted(day_dir.glob('*.srt')):
                    outp = convert_srt_to_json(p, out_dir / persona / day, vdir)
                    outputs.append(outp)
                    all_outputs_map[persona][day].append(outp)
        # Summaries are only produced in recursive mode.
        write_daily_and_human_summaries(all_outputs_map, summaries_out)
    else:
        if srt_path.is_dir():
            for p in srt_path.rglob('*.srt'):
                outp = convert_srt_to_json(p, target_out_dir_for(p), video_dir)
                outputs.append(outp)
        else:
            outp = convert_srt_to_json(srt_path, target_out_dir_for(srt_path), video_dir)
            outputs.append(outp)

    print('Wrote:', *map(str, outputs), sep='\n')

# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
`

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions