Refine auto-generated subs, split the transcript into smaller segments to get pre-summaries, and better balance the context window

2025-01-08 14:40:45 -06:00
parent 54980ae706
commit 150d476739

@@ -5,6 +5,7 @@ import io
import json
import yt_dlp
from ollama import chat, ChatResponse, Client
from textwrap import wrap
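# wrap() is used below to split long transcripts into fixed-size chunks for the model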
ydl_opts = {
    'writesubtitles': True, # Enable downloading subtitles
@@ -20,24 +21,43 @@ ol_client = Client(
    host='http://localhost:11434'
)
def get_summary(subtitles):
"""
Gets a summary from a local ollama installation given a string with subtitles in it.
Args:
subtitles (str): A string with subs
Returns:
str: A string with the AI's response in it.
"""
def refine(subtitles):
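    """
    Cleans up a chunk of auto-generated subtitles, correcting only grammar and spelling.

    Args:
        subtitles (str): A snippet of the transcript

    Returns:
        str: The corrected snippet
    """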
    response : ChatResponse = ol_client.chat(model='frowning/llama3-nymeria:15b-q6_k', messages=[
        {
            'role': 'system',
            'content': 'Your job is to summarize YouTube videos given a (potentially auto-generated) transcript. Summarize the video, cutting out sponsor segments and advertisements. Include all core points in the video. Be as detailed as possible. Your response should be at least five paragraphs.'
            'content': 'Your job is to refine auto-generated subtitles from YouTube. You will be given a snippet of a transcript of a YouTube video that may or may not split at a sentence boundary. You are to ONLY correct grammar and spelling mistakes with that transcript. If you encounter a "[ __ ]" segment, a swear has been redacted. Your text will be concatenated with other snippets, so it is important that you only spit back the corrected transcript and not any notes, headers, etc.'
        },
        {
            'role': 'user',
            'content': 'Please summarize this video: ' + str(subtitles)
            'content': str(subtitles)
        }
    ])
    return(response['message']['content'])

def get_pre_summary(subtitles):
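    """
    Summarizes a single chunk of the transcript.

    Args:
        subtitles (str): A snippet of the transcript

    Returns:
        str: A summary of the snippet
    """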
    response : ChatResponse = ol_client.chat(model='frowning/llama3-nymeria:15b-q6_k', messages=[
        {
            'role': 'system',
            'content': 'Your job is to summarize a snippet of a YouTube video given a chunk of its transcript. Summarize the snippet to the best of your ability.'
        },
        {
            'role': 'user',
            'content': str(subtitles)
        }
    ])
    return(response['message']['content'])

def get_summary(subtitles):
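    """
    Produces the final summary of the video from the concatenated per-chunk summaries.

    Args:
        subtitles (str): The concatenated chunk summaries

    Returns:
        str: The overall summary of the video
    """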
    response : ChatResponse = ol_client.chat(model='frowning/llama3-nymeria:15b-q6_k', messages=[
        {
            'role': 'system',
            'content': 'Your job is to summarize YouTube videos given a series of summaries of snippets of the YouTube video. Given those snippets, summarize the YouTube video.'
        },
        {
            'role': 'user',
            'content': str(subtitles)
        }
    ])
    return(response['message']['content'])
@@ -72,6 +92,7 @@ def main():
        description="Download subtitles from a video and summarize it using a local Ollama instance."
    )
    parser.add_argument('url', metavar='URL', type=str, help="The URL of the video to process.")
    parser.add_argument('-v', '--verbose', action='store_true', help="Enable verbose output.")
    # Parse out arguments
    args = parser.parse_args()
@@ -86,6 +107,7 @@ def main():
    subtitle_lang = 'en' # Change this to your desired language
    subtitles_available = info.get('subtitles', {})
    automatic_subtitles_available = info.get('automatic_captions', {})
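    # Track whether we end up falling back to auto-generated subtitles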
    autogenned = False

    if subtitle_lang in subtitles_available:
        print(f"Downloading manual subtitles for language: {subtitle_lang}...")
@@ -93,6 +115,7 @@ def main():
    elif subtitle_lang in automatic_subtitles_available:
        print(f"No manual subtitles available. Falling back to auto-generated subtitles for language: {subtitle_lang}...")
        subtitle_url = automatic_subtitles_available[subtitle_lang][0]['url']
        autogenned = True
    else:
        print(f"No subtitles (manual or auto-generated) available for language: {subtitle_lang}!")
        subtitle_url = None
@@ -106,8 +129,40 @@ def main():
        exit(50)
    subs = concatenate_subtitles(json.loads(subtitle_data))

    print("Getting summary...")
    summary = get_summary(subs)
    print(summary)
    # If we have auto-generated subtitles, refine them a bit:
    if autogenned:
        print("Refining transcript...")
        buffer = ""
        # We split this into smaller chunks to urge the AI to only do small pieces
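        # 2048 characters per chunk keeps each refinement request well within the model's context window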
        chunked = wrap(subs, 2048)
        print(f"Splitting text into {len(chunked)} segments...")
        for snippet in chunked:
            if args.verbose:
                print(f"Unrefined: {snippet}")
            ref = refine(snippet)
            if args.verbose:
                print(f"Refined: {ref}")
            buffer += ref
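            # Print a progress marker per chunk when not running verbosely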
            if not args.verbose:
                print("#", end="")
        subs = buffer
        if args.verbose:
            print(subs)

    if args.verbose:
        print("Getting summary...")
    # Now chunk the subs up and get summaries of segments
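    # Larger 4096-character chunks are used here, since each chunk is condensed into a short pre-summary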
firstpass = ""
chunked = wrap(subs, 4096)
print(f"Splitting text into {len(chunked)} segments...")
for snippet in chunked:
pre_summary = get_pre_summary(snippet)
if args.verbose:
print(f"Presummary: {pre_summary}")
firstpass += pre_summary
if not args.verbose:
print("#", end="")
# And a summary of the whole
summary = get_summary(firstpass)
print(f"Summary: {summary}")
main()