3d2a-index/odyseescraper/odysee.py

#! /usr/bin/env python3
import json
import requests
import time

# Base address of the Odysee web front end; channel and release links are built from it.
odysee_url = "https://odysee.com"
# Endpoint that API calls (such as claim_search) are POSTed to.
odysee_api_url = "https://api.na-backend.odysee.com/api/v1/proxy"

def odysee_get_channel_url(handle):
    """Return the Odysee web URL of the channel identified by *handle*.

    The result is the site base URL and the handle joined by a single slash.
    """
    return "{}/{}".format(odysee_url, handle)

def odysee_get_releases(handle, *, page_size=20, max_pages=19, timeout=30):
    """Collect the downloadable (non-video) releases published by a channel.

    Pages through the ``claim_search`` API method for ``handle`` and keeps every
    claim that is not a repost, a collection or a video and that carries a
    source file hash.

    Args:
        handle: Channel identifier, sent as the ``channel`` search parameter and
            used to build each release URL.
        page_size: Number of claims requested per page.
        max_pages: Maximum number of pages fetched before stopping.
        timeout: Seconds to wait on each HTTP request before it fails.

    Returns:
        A dict mapping claim id to a dict with the keys ``name``, ``title``,
        ``publishdate`` (int, unix time), ``description``, ``thumbnail``,
        ``url`` and ``filehash``. Returns ``None`` when a request fails, the
        API reports an error, or the response is missing an expected field or
        holds an unparsable value.
    """
    releases = {}
    channel_url = odysee_get_channel_url(handle)
    try:
        for page in range(1, max_pages + 1):
            payload = {
                "method": "claim_search",
                "params": {
                    "channel": handle,
                    "page_size": page_size,
                    "page": page
                }
            }
            # Without a timeout a stalled connection would block the scraper forever.
            response = requests.post(odysee_api_url, json=payload, timeout=timeout)
            response.raise_for_status()
            data = response.json()
            # An API-level failure arrives with HTTP 200 and an "error" member instead of
            # "result"; report it rather than mistaking it for a channel with no releases.
            if data.get("error"):
                print(f'API error occurred while getting releases for {handle}: {data["error"]}')
                return None
            result = data.get("result") or {}
            lastpage = result.get("total_pages", 1)
            for item in result.get("items", []):
                # The value_type field can help us immediately whittle down chaff like reposts,
                # playlists, etc. By and large we only care about streams, but unknown types are
                # deliberately let through (blacklist rather than whitelist) and only reported.
                value_type = item["value_type"]
                if value_type in ("repost", "collection"):
                    # Reposts are not the channel's own content and collections are playlists.
                    continue
                if value_type != "stream":
                    print(f'Unknown value type, continuing: {value_type}')
                value = item["value"]
                # A stream is some kind of file; videos are not releases we want to index.
                if value.get("stream_type") == "video":
                    continue
                # If there is no file hash, it's not a file we want.
                file_hash = value.get("source", {}).get("hash")
                if not file_hash:
                    continue
                # "name" is required: it is part of the release URL, so a missing name is
                # treated like any other missing field (KeyError -> None).
                name = item["name"]
                releases[item["claim_id"]] = {
                    # Fields with .strip() at the end are user-controlled and may mess with
                    # sorting if leading/trailing whitespace is left in.
                    "name": name.strip(),
                    "title": value.get("title", "Untitled Release").strip(),
                    # This field is an int in unixtime
                    "publishdate": int(value.get("release_time", 0)),
                    "description": value.get("description", "No description provided for this release").strip(),
                    "thumbnail": value.get("thumbnail", {}).get("url", ""),
                    "url": f"{channel_url}/{name}",
                    "filehash": file_hash
                }
            # ">=" rather than "==": an empty result set may report total_pages as 0, which
            # an equality test would never match, causing every remaining page to be fetched.
            if page >= lastpage:
                break
            # If we're not on the last page, sleep for a second to be easier on Odysee.
            # This isn't a proper rate limiter, but it's something.
            time.sleep(1)
        else:
            # The loop ran out of pages without reaching the last one.
            print(f'Stopped after {max_pages} pages while getting releases for {handle}; results may be incomplete')
    except requests.RequestException as e:
        print(f'RequestException occurred while getting releases for {handle}: {e}')
        return None
    except KeyError as e:
        print(f'KeyError occurred while getting releases for {handle}: {e}')
        print('Nonzero chance Odysee updated their API out from under you')
        return None
    except ValueError as e:
        # Covers a non-JSON response body on requests versions whose JSON error is a plain
        # ValueError, and a release_time that cannot be converted to int.
        print(f'ValueError occurred while getting releases for {handle}: {e}')
        return None
    return releases