#! /usr/bin/env python3
"""Summarize a YouTube video from its English subtitles with a local Ollama model.

Requires the third-party ``yt_dlp`` and ``ollama`` packages (both listed in
requirements.txt).

Usage:
    summarize.py [VIDEO_URL]

When VIDEO_URL is omitted, the sample ``video_url`` defined below is used.
"""
import io
import json
import sys

import yt_dlp
from ollama import chat, ChatResponse, Client

# Sample video used when no URL is given on the command line.
# Alternative sample: 'https://youtu.be/jl4HOY8ZaEA'
video_url = 'https://www.youtube.com/watch?v=kTctVqjhDEw'

# Model used by get_summary() unless the caller passes another one.
DEFAULT_MODEL = 'frowning/llama3-nymeria:15b-q6_k'

# Subtitle rendition that concatenate_subtitles() understands
# (a JSON document with an 'events' list).
SUBTITLE_EXT = 'json3'

# Process exit status used when no usable subtitles could be obtained.
EXIT_NO_SUBTITLES = 50

ydl_opts = {
    'writesubtitles': True,      # Enable downloading subtitles
    'subtitleslangs': ['en'],    # Specify subtitle language(s)
    'skip_download': True,       # Skip downloading the video itself
    'outtmpl': '-',              # Use '-' to avoid writing to a file
    'quiet': True,               # Suppress console output
    'format': 'bestaudio/best',  # Minimal format setting for metadata extraction
    'writeinfojson': True,       # Store metadata, including subtitle URLs
}

ol_client = Client(
    host='http://localhost:11434'
)


def get_summary(subtitles, model=DEFAULT_MODEL):
    """
    Gets a summary from a local ollama installation given a string with subtitles in it.

    Args:
        subtitles (str): A string with subs
        model (str): Name of the ollama model to query. Defaults to DEFAULT_MODEL.

    Returns:
        str: A string with the AI's response in it.
    """
    response: ChatResponse = ol_client.chat(model=model, messages=[
        {
            'role': 'system',
            'content': 'Your job is to summarize YouTube videos given a (potentially auto-generated) transcript. Summarize the video, cutting out sponsor segments and advertisements. Include all core points in the video. Be as detailed as possible.'
        },
        {
            'role': 'user',
            'content': 'Please summarize this video: ' + str(subtitles)
        }
    ])

    return response['message']['content']


def concatenate_subtitles(subtitle_json):
    """
    Concatenates all subtitle text from the given JSON object.

    Events without a 'segs' list and segments without a 'utf8' key are
    skipped. A payload that is not a dict, or has no 'events', yields ''.

    Args:
        subtitle_json (dict): A dictionary containing subtitle data.

    Returns:
        str: A single string with all concatenated subtitle text, with the
            individual segments joined by single spaces.
    """
    if not isinstance(subtitle_json, dict):
        return ''

    pieces = []
    # `or []` also covers keys that are present but null in the payload.
    for event in subtitle_json.get('events') or []:
        pieces.extend(
            seg['utf8'] for seg in event.get('segs') or [] if 'utf8' in seg
        )

    return ' '.join(pieces)


def _find_subtitle_url(tracks, lang, ext=SUBTITLE_EXT):
    """Return the URL of the *ext* rendition of *lang* in *tracks*, or None."""
    # Each language maps to a list of renditions (json3, vtt, ...). Pick by
    # 'ext' rather than by position: only the JSON rendition can be parsed.
    for rendition in (tracks or {}).get(lang) or []:
        if rendition.get('ext') == ext and rendition.get('url'):
            return rendition['url']
    return None


def main(argv=None):
    """
    Fetch the subtitles of a video, summarize them and print the summary.

    Args:
        argv (list[str] | None): Command-line arguments without the program
            name; defaults to sys.argv[1:]. The first argument, if present,
            is the video URL; otherwise the module-level video_url is used.

    Returns:
        int: Process exit status; 0 on success, EXIT_NO_SUBTITLES when no
            usable subtitles were found.
    """
    args = sys.argv[1:] if argv is None else argv
    url = args[0] if args else video_url
    subtitle_lang = 'en'  # Change this to your desired language

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # Extract metadata without downloading the video
        info = ydl.extract_info(url, download=False)

        # Prefer manually authored subtitles; fall back to auto-generated ones.
        subtitle_url = _find_subtitle_url(info.get('subtitles'), subtitle_lang)
        if subtitle_url:
            print(f"Downloading manual subtitles for language: {subtitle_lang}")
        else:
            subtitle_url = _find_subtitle_url(
                info.get('automatic_captions'), subtitle_lang
            )
            if subtitle_url:
                print(f"No manual subtitles available. Falling back to auto-generated subtitles for language: {subtitle_lang}")
            else:
                print(f"No subtitles (manual or auto-generated) available for language: {subtitle_lang}")
                print("Failed to download subtitles.")
                return EXIT_NO_SUBTITLES

        subtitle_data = ydl.urlopen(subtitle_url).read().decode('utf-8')

    subs = concatenate_subtitles(json.loads(subtitle_data))
    if not subs.strip():
        # Nothing to summarize; do not send an empty prompt to the model.
        print("Subtitle track contained no text.")
        return EXIT_NO_SUBTITLES

    print("Getting summary...")
    summary = get_summary(subs)
    print(summary)
    return 0


if __name__ == '__main__':
    sys.exit(main())