| | import yt_dlp as youtube_dl |
| |
|
| |
|
def download_youtube_audio(url, output_path, preferred_quality="192"):
    """Fetch the audio of ``url`` with yt-dlp and extract it as MP3.

    Args:
        url: Address of the video whose audio should be downloaded.
        output_path: Value handed to yt-dlp as its ``outtmpl`` option.
        preferred_quality: Forwarded as ``preferredquality`` to the
            ``FFmpegExtractAudio`` post-processor (defaults to ``"192"``).

    Returns:
        ``output_path`` when the download completes, or ``None`` when yt-dlp
        raises ``DownloadError`` (the error is printed, not re-raised).
    """
    # NOTE(review): the post-processor converts to mp3, so the file on disk
    # may not be named exactly ``output_path`` -- confirm against yt-dlp's
    # output-template behaviour.
    extract_mp3 = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': preferred_quality,
    }
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [extract_mp3],
        'outtmpl': output_path,
    }

    try:
        with youtube_dl.YoutubeDL(options) as downloader:
            # Metadata-only pass so the title can be reported up front.
            metadata = downloader.extract_info(url, download=False)
            title = metadata.get('title')
            print(f"Downloading audio for: {title}")

            downloader.download([url])
            print(f"Audio file saved as: {output_path}")
    except youtube_dl.utils.DownloadError as err:
        print(f"Error downloading audio: {err}")
        return None

    return output_path
| |
|
| |
|
def transcribe(path, model):
    """Transcribe the audio at ``path`` using a ``WhisperModel``.

    Args:
        path: Audio input forwarded to the model's ``transcribe`` method.
        model: Argument passed to the ``WhisperModel`` constructor.

    Returns:
        The first element of the pair returned by ``WhisperModel.transcribe``
        (the segments); the second element is discarded.
    """
    whisper = WhisperModel(model)

    print(f"reading {path}")
    # The second value of the pair is not needed here.
    segments, _info = whisper.transcribe(path)
    return segments
| |
|
| |
|
| |
|
def process_segments(segments: Generator):
    """Collect transcription segments into a pandas DataFrame.

    Each segment becomes one row labelled ``chunk_<n>`` (``n`` is the
    zero-based position in ``segments``). Note that the ``chunk_id`` column
    holds the segment's own ``id`` attribute, not the row label.

    Args:
        segments: Iterable of objects exposing ``id``, ``start``, ``end`` and
            ``text`` attributes. It is consumed exactly once.

    Returns:
        A DataFrame with the columns ``chunk_id``, ``chunk_length``, ``text``,
        ``start_time`` and ``end_time``. The columns are present even when
        ``segments`` is empty, so downstream code (and the CSV header) keep a
        stable schema.
    """
    columns = ['chunk_id', 'chunk_length', 'text', 'start_time', 'end_time']

    print("processing...")
    records = [
        {
            'chunk_id': segment.id,
            'chunk_length': segment.end - segment.start,
            'text': segment.text,
            'start_time': segment.start,
            'end_time': segment.end,
        }
        for segment in segments
    ]
    labels = [f"chunk_{i}" for i in range(len(records))]

    # Build the frame once, after iteration, with an explicit schema so an
    # empty input still produces the expected columns.
    return pd.DataFrame(records, index=labels, columns=columns)
| |
|
| |
|
| |
|
def gen_csv(url="https://www.youtube.com/watch?v=Sby1uJ_NFIY",
            model="distil-large-v3",
            csv_path='alo.csv'):
    """Download a video's audio, transcribe it and write the result as CSV.

    Called without arguments it behaves as before: it processes the
    hard-coded default video with the ``distil-large-v3`` model and writes
    ``alo.csv``.

    Args:
        url: Video address passed to ``download_youtube_audio``.
        model: Model argument passed to ``transcribe``.
        csv_path: Destination passed to ``DataFrame.to_csv``.

    Raises:
        RuntimeError: If ``download_youtube_audio`` returns ``None`` (download
            failed), so the failure is reported before any model is loaded.
    """
    # NOTE(review): ``path`` is not defined in this function; it is expected
    # to exist at module level -- confirm where it is set.
    audio_path = download_youtube_audio(url, path)
    if audio_path is None:
        # Previously ``None`` was handed straight to ``transcribe``, which
        # failed later with an unrelated-looking error.
        raise RuntimeError(f"Audio download failed for {url}; nothing to transcribe")

    df = process_segments(transcribe(audio_path, model))
    df.to_csv(csv_path)
| |
|
| | |