How to extract metadata for more than 20000 videos from channel using YouTube Data API v3?

The solution to this problem could be either forcing the API to return metadata for more than 20,000 videos from a channel, or specifying a time period during which the videos were uploaded. That way, the script can be run repeatedly over multiple time periods to extract metadata for all videos.

I tried this without success.

My solution to failing YouTube API backend is using this Python script:

It consists of faking the requests made when browsing the “Videos” tab of a YouTube channel.

import urllib.request, json, subprocess
from urllib.error import HTTPError

def getURL(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    On an HTTP error status, the body of the error response is returned
    instead of raising, so callers always get whatever the server sent.
    """
    try:
        raw = urllib.request.urlopen(url).read()
    except HTTPError as err:
        # The error response still carries a readable body.
        raw = err.read()
    return raw.decode('utf-8')

def exec(cmd):
    """Run *cmd* through the shell and return its raw stdout as bytes.

    NOTE(review): this shadows the builtin ``exec`` and uses shell=True,
    so *cmd* must be a trusted, locally-built string (as it is here).
    Raises subprocess.CalledProcessError on a non-zero exit status.
    """
    completed = subprocess.run(cmd, shell=True, check=True,
                               stdout=subprocess.PIPE)
    return completed.stdout

youtuberId = 'CHANNEL_ID'
videosIds = []
errorsCount = 0

def retrieveVideosFromContent(content):
    global videosIds
    wantedPattern = '"videoId":"'
    content = content.replace('"videoId": "', wantedPattern).replace("'videoId': '", wantedPattern)
    contentParts = content.split(wantedPattern)
    contentPartsLen = len(contentParts)
    for contentPartsIndex in range(contentPartsLen):
        contentPart = contentParts[contentPartsIndex]
        contentPartParts = contentPart.split('"')
        videoId = contentPartParts[0]
        videoIdLen = len(videoId)
        if not videoId in videosIds and videoIdLen == 11:
            videosIds += [videoId]

def scrape(token):
    """Fetch one "browse" continuation page identified by *token*.

    Collects the video ids found in the response into ``videosIds`` and
    returns the next continuation token, or '' when the channel has no
    further pages.  When the response carries no continuation data it
    retries (incrementing the global ``errorsCount``); the retry is a
    loop rather than the original self-recursion, so persistent
    failures can no longer exhaust the call stack (RecursionError).
    """
    global errorsCount, data
    # YOUR_KEY can be obtained by browsing a videos channel section (like
    # https://www.youtube.com/c/BenjaminLoison/videos) while checking your
    # "Network" tab using for instance Ctrl+Shift+E
    cmd = 'curl -s \'https://www.youtube.com/youtubei/v1/browse?key=YOUR_KEY\' -H \'Content-Type: application/json\' --data-raw \'{"context":{"client":{"clientName":"WEB","clientVersion":"2.20210903.05.01"}},"continuation":"' + token + '"}\''
    # Swap quote styles so the JSON payload survives the shell: escape
    # the double quotes, then promote the single quotes to double.
    cmd = cmd.replace('"', '\\"').replace("\'", '"')

    # Retry until the response actually contains continuation data.
    while True:
        content = exec(cmd).decode('utf-8')
        retrieveVideosFromContent(content)
        data = json.loads(content)
        if 'onResponseReceivedActions' in data:
            break
        print('no token found let\'s try again')
        errorsCount += 1

    entry = data['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'][-1]
    # The last grid item is a continuationItemRenderer only when more
    # pages remain; otherwise signal the end with ''.
    if not 'continuationItemRenderer' in entry:
        return ''
    return entry['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']

# Grab the channel's "Videos" page and extract the initial data blob
# YouTube embeds as a JavaScript literal.  (The original text had the
# quote characters garbled on the next two statements — syntax errors.)
url = "https://www.youtube.com/channel/" + youtuberId + '/videos'
content = getURL(url)
content = content.split('var ytInitialData=')[1].split(';</script>')[0]
dataFirst = json.loads(content)

retrieveVideosFromContent(content)

# Initial continuation token of the "Videos" tab grid: the last grid
# item is the continuationItemRenderer pointing at the next page.
token = dataFirst['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['gridRenderer']['items'][-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']

# Follow the continuation chain, printing progress each page, until
# scrape() signals the end with an empty token.
while True:
    collectedCount = len(videosIds)
    print(collectedCount, token)
    if not token:
        break
    token = scrape(token)

print(collectedCount, videosIds)

Make sure to replace the CHANNEL_ID and YOUR_KEY values. Also make sure the curl command is available from your shell.

Leave a Comment