# transcribe/transcribe.py at main · atsyplikhin/transcribe · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import os
import sys
import tempfile
from pathlib import Path

from openai import OpenAI
from pydub import AudioSegment

# Length of each audio chunk sent for transcription, and how far consecutive
# chunks overlap, both in milliseconds.
CHUNK_LENGTH_MS = 10 * 60 * 1000  # 10 minutes
OVERLAP_MS = 5 * 1000  # 5 seconds


def parse_args(argv):
    """Extract the script arguments from an ``argv``-style list.

    Args:
        argv: Command line as in ``sys.argv``; ``argv[1]`` (the audio path)
            must be present, ``argv[2]`` (language) and ``argv[3]`` (speaker
            names) are optional.

    Returns:
        A ``(input_file_path, language, speaker_names)`` tuple. ``language``
        defaults to ``"en"``. ``speaker_names`` is either an empty string or
        the sentence ``"The speakers are <argv[3]>."`` ready to be embedded
        in the clean-up prompt.
    """
    input_file_path = argv[1]
    language = argv[2] if len(argv) >= 3 else "en"
    speaker_names = f"The speakers are {argv[3]}." if len(argv) >= 4 else ""
    return input_file_path, language, speaker_names


def chunk_ranges(total_ms: int, chunk_ms: int, overlap_ms: int):
    """Yield ``(start, end)`` millisecond ranges that cover ``total_ms``.

    Each range is at most ``chunk_ms`` long and starts ``overlap_ms`` before
    the previous range ended. Iteration stops as soon as a range reaches the
    end of the audio, so no trailing range is produced that lies entirely
    inside the previous one.

    Args:
        total_ms: Total length to cover; ``0`` yields nothing.
        chunk_ms: Maximum length of one range; must be positive.
        overlap_ms: Overlap between consecutive ranges; must satisfy
            ``0 <= overlap_ms < chunk_ms`` so that every step moves forward.

    Raises:
        ValueError: If the chunk/overlap sizes would not advance.
    """
    if chunk_ms <= 0 or not 0 <= overlap_ms < chunk_ms:
        raise ValueError("chunk_ms must be positive and overlap_ms must be in [0, chunk_ms)")

    start = 0
    while start < total_ms:
        end = min(start + chunk_ms, total_ms)
        yield start, end
        if end >= total_ms:
            # This range already reached the end; stepping back by the
            # overlap would only re-cover audio that has been emitted.
            return
        start = end - overlap_ms


def build_prompt(speaker_names: str) -> str:
    """Return the clean-up prompt appended after the raw transcription.

    Args:
        speaker_names: Sentence naming the speakers, or an empty string; it
            is inserted on its own line of the prompt.
    """
    return f"""---
You are a helpful assistant. Your task is to correct any spelling discrepancies in
the transcribed text above, combine portions, split with new lines when speaker or topic appear to changes.
Remove filler words such as okay, right, you know, kind of, like, really, you know, well, and others.
Do not remove phrases otherwise, keep the whole meaning.
Only add necessary punctuation such as periods, commas, and capitalization, and use only the context provided.
{speaker_names}
The format must be as follows:
**Speaker 1 Name**: Hello.

**Speaker 2 Name**: Hello.

**Speaker 1 Name**: How are you?"""


def main(argv=None):
    """Transcribe an audio file chunk by chunk and write ``<stem>_transcript.txt``.

    The transcript is written next to the input file and is followed by the
    clean-up prompt from ``build_prompt``.

    Args:
        argv: Command line to use; defaults to ``sys.argv``.

    Exits with status 1 when the audio path argument is missing or does not
    point to an existing file.
    """
    argv = sys.argv if argv is None else argv

    # Check if the required file path is provided
    if len(argv) < 2:
        print("Usage: python script.py <path to audio file> [<language>] [<speaker names>]")
        sys.exit(1)

    input_file_path, language, speaker_names = parse_args(argv)

    # Verify input file path
    input_path = Path(input_file_path)
    if not input_path.is_file():
        print(f"Error: The file {input_file_path} does not exist.")
        sys.exit(1)

    # Initialize OpenAI client
    client = OpenAI()

    # Read the audio file
    audio = AudioSegment.from_file(input_file_path)

    transcription_file_path = input_path.parent / f"{input_path.stem}_transcript.txt"

    # The intermediate MP3 lives in a private temporary directory so it is
    # removed even when export or the API call fails, and so it cannot
    # collide with a file of the same name in the working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / "chunk.mp3"

        # Explicit UTF-8 so non-ASCII transcripts do not depend on the
        # platform's default encoding.
        with transcription_file_path.open("w", encoding="utf-8") as out_file:
            ranges = chunk_ranges(len(audio), CHUNK_LENGTH_MS, OVERLAP_MS)
            for number, (start, end) in enumerate(ranges, start=1):
                # export() hands back the file object it wrote to; close it
                # before the file is reopened for reading.
                exported = audio[start:end].export(str(tmp_path), format="mp3")
                exported.close()

                with tmp_path.open("rb") as audio_file:
                    transcription = client.audio.transcriptions.create(
                        model="whisper-1",
                        file=audio_file,
                        language=language,
                    )

                line = f"Transcription portion {number}\n{transcription.text}\n\n"
                out_file.write(line)
                print(line)

            out_file.write(build_prompt(speaker_names))

    print(f"Transcription completed. Output file located at: {transcription_file_path}")


if __name__ == "__main__":
    main()