From 223260fdd69a43125c905fd9b50006004a8ea63a Mon Sep 17 00:00:00 2001
From: vccalvin33
Date: Sat, 21 May 2022 03:01:37 +0700
Subject: [PATCH 1/2] modify comments.py, add video_stats, add video_ids, fix
 some error issues

---
 utils/comments.py |  17 +++-
 yt_public.py      | 247 ++++++++++++++++++++++++++++++++++++----------
 2 files changed, 207 insertions(+), 57 deletions(-)

diff --git a/utils/comments.py b/utils/comments.py
index b612b64..782f345 100644
--- a/utils/comments.py
+++ b/utils/comments.py
@@ -1,10 +1,11 @@
 import csv
 from datetime import datetime as dt
 
-comments = []
 today = dt.today().strftime('%d-%m-%Y')
+PATH = 'commentsFolder/'
 
 def process_comments(response_items, csv_output=False):
+    comments = []
 
     for res in response_items:
 
@@ -29,13 +30,19 @@ def process_comments(response_items, csv_output=False):
 
     return comments
 
-def make_csv(comments, channelID=None):
+def make_csv(comments, channelID=None, videoID=None):
+    # Handle the case of a video with 0 comments
+    if len(comments) == 0:
+        return
+
     header = comments[0].keys()
 
-    if channelID:
-        filename = f'comments_{channelID}_{today}.csv'
+    if channelID and videoID:
+        filename = f'{PATH}comments_{channelID}_{videoID}_{today}.csv'
+    elif channelID:
+        filename = f'{PATH}comments_{channelID}_{today}.csv'
     else:
-        filename = f'comments_{today}.csv'
+        filename = f'{PATH}comments_{today}.csv'
 
     with open(filename, 'w', encoding='utf8', newline='') as f:
         writer = csv.DictWriter(f, fieldnames=header)
diff --git a/yt_public.py b/yt_public.py
index 3fda378..069ca12 100644
--- a/yt_public.py
+++ b/yt_public.py
@@ -1,15 +1,28 @@
 import os
+import csv
+from datetime import datetime as dt
+from urllib import response
 from dotenv import load_dotenv
 from googleapiclient.discovery import build
 from utils.comments import process_comments, make_csv
 
 load_dotenv()
-API_KEY = os.getenv("API_KEY")
+API_KEY_1 = os.getenv("API_KEY_1")
+API_KEY_2 = os.getenv("API_KEY_2")
+API_KEY_3 = os.getenv("API_KEY_3")
+API_KEY_4 = os.getenv("API_KEY_4")
+API_KEY_5 = os.getenv("API_KEY_5")
 
-youtube = build("youtube", "v3", developerKey=API_KEY)
+youtube_1 = build("youtube", "v3", developerKey=API_KEY_1)
+youtube_2 = build("youtube", "v3", developerKey=API_KEY_2)
+youtube_3 = build("youtube", "v3", developerKey=API_KEY_3)
+youtube_4 = build("youtube", "v3", developerKey=API_KEY_4)
+youtube_5 = build("youtube", "v3", developerKey=API_KEY_5)
 
-def search_result(query):
+scraped_videos = {}
+
+def search_result(youtube, query):
     """
     Refer to the documentation: https://googleapis.github.io/google-api-python-client/docs/dyn/youtube_v3.search.html
     """
     request = youtube.search().list(
@@ -21,87 +34,217 @@ def search_result(query):
     return request.execute()
 
-def channel_stats(channelID):
+def get_video_ids(youtube, channelId):
     """
-    Refer to the documentation: https://googleapis.github.io/google-api-python-client/docs/dyn/youtube_v3.channels.html
+    Refer to the documentation: https://googleapis.github.io/google-api-python-client/docs/dyn/youtube_v3.search.html
     """
+    videoIds = []
+
     request = youtube.channels().list(
-        part="statistics",
-        id=channelID
+        part="contentDetails",
+        id=channelId
     )
-    return request.execute()
 
-def comment_threads(channelID, to_csv=False):
-
-    comments_list = []
-
-    request = youtube.commentThreads().list(
-        part='id,replies,snippet',
-        videoId=channelID,
+    response = request.execute()
+
+    playlistId = response['items'][0]['contentDetails']['relatedPlaylists']['uploads']
+
+    request = youtube.playlistItems().list(
+        part="contentDetails",
+        playlistId=playlistId,
+        maxResults=50
     )
-    response = request.execute()
-    comments_list.extend(process_comments(response['items']))
+    response = request.execute()
+
+    responseItems = response['items']
+
+    videoIds.extend([item['contentDetails']['videoId'] for item in responseItems])
 
     # if there is nextPageToken, then keep calling the API
     while response.get('nextPageToken', None):
-        request = youtube.commentThreads().list(
-            part='id,replies,snippet',
-            videoId=channelID,
+        print(f'Fetching next page of videos for {channelId}_{playlistId}')
+        request = youtube.playlistItems().list(
+            part="contentDetails",
+            playlistId=playlistId,
+            maxResults=50,
             pageToken=response['nextPageToken']
         )
         response = request.execute()
-        comments_list.extend(process_comments(response['items']))
+        responseItems = response['items']
+
+        videoIds.extend([item['contentDetails']['videoId'] for item in responseItems])
 
-    print(f"Finished fetching comments for {channelID}. {len(comments_list)} comments found.")
+    print(f"Finished fetching videoIds for {channelId}. {len(videoIds)} videos found.")
+
+    return videoIds
+
+def channel_stats(youtube, channelIDs, to_csv=False):
+    """
+    Refer to the documentation: https://googleapis.github.io/google-api-python-client/docs/dyn/youtube_v3.channels.html
+    """
+    if type(channelIDs) == str:
+        channelIDs = [channelIDs]
+
+    stats_list = []
+
+    for channelId in channelIDs:
+        request = youtube.channels().list(
+            part="statistics",
+            id=channelId
+        )
+        response = request.execute()
+        response = response['items'][0]['statistics']
+        response['channelId'] = channelId
+
+        stats_list.append(response)
+
+    if to_csv:
+        header = stats_list[0].keys()
+        with open('channelStats.csv', 'w') as f:
+            writer = csv.DictWriter(f, fieldnames=header)
+            writer.writeheader()
+            writer.writerows(stats_list)
+
+    return stats_list
+
+def video_stats(youtube, videoIDs, channelID, to_csv=False):
+    if type(videoIDs) == str:
+        videoIDs = [videoIDs]
+
+    stats_list = []
+
+    for videoId in videoIDs:
+        request = youtube.videos().list(
+            part="snippet,statistics,contentDetails",
+            id=videoId
+        )
+        response = request.execute()
+        statistics = response['items'][0]['statistics']
+        snippet = response['items'][0]['snippet']
+        statistics['videoId'] = videoId
+        statistics['title'] = snippet['title']
+        statistics['description'] = snippet['description']
+        statistics['publishedAt'] = snippet['publishedAt']
+        statistics['duration'] = response['items'][0]['contentDetails']['duration']
+        statistics['thumbnail'] = snippet['thumbnails']['high']['url']
+        statistics['channelId'] = channelID
+
+        if statistics.get('likeCount', None) == None:
+            statistics['likeCount'] = 0
+
+        print(f"Fetched stats for {videoId}")
+        stats_list.append(statistics)
 
     if to_csv:
-        make_csv(comments_list, channelID)
+        header = stats_list[0].keys()
+        with open(f'videosFolder/videoStats_{channelID}.csv', 'w', encoding='utf8', newline='') as f:
+            writer = csv.DictWriter(f, fieldnames=header)
+            writer.writeheader()
+            writer.writerows(stats_list)
 
-    return comments_list
+    print(f'Successfully fetched video stats for {channelID}')
+    return stats_list
 
-def get_video_ids(channelId):
-    """
-    Refer to the documentation: https://googleapis.github.io/google-api-python-client/docs/dyn/youtube_v3.search.html
-    """
-    videoIds = []
-
-    request = youtube.search().list(
-        part="snippet",
-        channelId=channelId,
-        type="video",
-        maxResults=50,
-        order="date"
-    )
-    response = request.execute()
-    responseItems = response['items']
+def comment_threads(youtube, videoID, channelID=None, to_csv=False):
+
+    comments_list = []
+
+    try:
+        request = youtube.commentThreads().list(
+            part='id,replies,snippet',
+            videoId=videoID,
+        )
+        response = request.execute()
+    except Exception as e:
+        print(f'Error fetching comments for {videoID} - error: {e}')
+        if scraped_videos.get('error_ids', None):
+            scraped_videos['error_ids'].append(videoID)
+        else:
+            scraped_videos['error_ids'] = [videoID]
+        return
 
-    videoIds.extend([item['id']['videoId'] for item in responseItems if item['id'].get('videoId', None) != None])
+    comments_list.extend(process_comments(response['items']))
 
     # if there is nextPageToken, then keep calling the API
     while response.get('nextPageToken', None):
-        request = youtube.search().list(
-            part="snippet",
-            channelId=channelId,
+        request = youtube.commentThreads().list(
+            part='id,replies,snippet',
+            videoId=videoID,
             pageToken=response['nextPageToken']
         )
         response = request.execute()
-        responseItems = response['items']
+        comments_list.extend(process_comments(response['items']))
+
+    print(f"Finished fetching comments for {videoID}. {len(comments_list)} comments found.")
+
+    if to_csv:
+        try:
+            make_csv(comments_list, channelID, videoID)
+        except Exception as e:
+            print(f'Error writing comments to csv for {videoID} - error: {e}')
+            if scraped_videos.get('error_csv_ids', None):
+                scraped_videos['error_csv_ids'].append(videoID)
+            else:
+                scraped_videos['error_csv_ids'] = [videoID]
+            return
+
+    if scraped_videos.get(channelID, None):
+        scraped_videos[channelID].append(videoID)
+    else:
+        scraped_videos[channelID] = [videoID]
+
+    return comments_list
 
-    videoIds.extend([item['id']['videoId'] for item in responseItems if item['id'].get('videoId', None) != None])
+if __name__ == '__main__':
+    pyscriptVidId = 'Qo8dXyKXyME'
+    channelId = 'UCzIxc8Vg53_ewaRIk3shBug'
+
+    channelIds = []
+    # with open('youtube.csv', 'r') as csvfile:
+    #     reader = csv.reader(csvfile)
+    #     next(reader)
+    #     for row in reader:
+    #         channelIds.append(row[2])
+
+    # channel_stats(youtube_2, channelIds, to_csv=True)
+
+    # videoDict = {}
+    # for idx in range(len(channelIds)):
+    #     if idx <= len(channelIds)/5:
+    #         youtube = youtube_1
+    #     elif idx <= len(channelIds)/5*2:
+    #         youtube = youtube_2
+    #     elif idx <= len(channelIds)/5*3:
+    #         youtube = youtube_3
+    #     elif idx <= len(channelIds)/5*4:
+    #         youtube = youtube_4
+    #     else:
+    #         youtube = youtube_5
+    #     videoIds = get_video_ids(youtube, channelIds[idx])
+    #     videoDict[channelIds[idx]] = videoIds
 
-    print(f"Finished fetching videoIds for {channelId}. {len(videoIds)} videos found.")
 
-    return videoIds
+    import json
+    # with open('videoDict.json', 'w') as fp:
+    #     json.dump(videoDict, fp)
 
-if __name__ == '__main__':
-    pyscriptVidId = 'Qo8dXyKXyME'
-    channelId = 'UCzIxc8Vg53_ewaRIk3shBug'
+    with open('videoDict.json', 'r') as fp:
+        videoDict = json.load(fp)
+
+    for channelId, videoIds in videoDict.items():
+        video_stats(youtube_3, videoIDs=videoIds, channelID=channelId, to_csv=True)
+
+    # for channelId, videoIds in videoDict.items():
+    #     for videoId in videoIds:
+    #         comment_threads(youtube_3, videoID=videoId, channelID=channelId, to_csv=True)
+
+    # with open('scrapedVideos.json', 'w') as fp:
+    #     json.dump(scraped_videos, fp)
 
     # response = search_result("pyscript")
-    response = channel_stats(channelId)
-    # response = comment_threads(pyscriptVidId, to_csv=True)
+    # response = channel_stats(youtube_2, channelId)
+    # response = comment_threads(youtube_2, videoID='pnecPXlfR5U', to_csv=False)
+    # print(response)
 
-    print(response)
+    #NOTES -> troublesome video (videoId: CqssnS_v1a4) -> no likeCount, so I manually add 0 to it
\ No newline at end of file

From fc9d9d93be02849b884e2bf238fca2abadd87caf Mon Sep 17 00:00:00 2001
From: vccalvin33
Date: Sat, 21 May 2022 15:56:40 +0700
Subject: [PATCH 2/2] make videoStats func cleaner

---
 yt_public.py | 54 ++--------------------------------------------------
 1 file changed, 2 insertions(+), 52 deletions(-)

diff --git a/yt_public.py b/yt_public.py
index 069ca12..4e88a51 100644
--- a/yt_public.py
+++ b/yt_public.py
@@ -128,9 +128,7 @@ def video_stats(youtube, videoIDs, channelID, to_csv=False):
         statistics['duration'] = response['items'][0]['contentDetails']['duration']
         statistics['thumbnail'] = snippet['thumbnails']['high']['url']
         statistics['channelId'] = channelID
-
-        if statistics.get('likeCount', None) == None:
-            statistics['likeCount'] = 0
+        statistics['likeCount'] = statistics.get('likeCount', 0)
 
         print(f"Fetched stats for {videoId}")
         stats_list.append(statistics)
@@ -199,52 +197,4 @@ def comment_threads(youtube, videoID, channelID=None, to_csv=False):
 
 if __name__ == '__main__':
     pyscriptVidId = 'Qo8dXyKXyME'
-    channelId = 'UCzIxc8Vg53_ewaRIk3shBug'
-
-    channelIds = []
-    # with open('youtube.csv', 'r') as csvfile:
-    #     reader = csv.reader(csvfile)
-    #     next(reader)
-    #     for row in reader:
-    #         channelIds.append(row[2])
-
-    # channel_stats(youtube_2, channelIds, to_csv=True)
-
-    # videoDict = {}
-    # for idx in range(len(channelIds)):
-    #     if idx <= len(channelIds)/5:
-    #         youtube = youtube_1
-    #     elif idx <= len(channelIds)/5*2:
-    #         youtube = youtube_2
-    #     elif idx <= len(channelIds)/5*3:
-    #         youtube = youtube_3
-    #     elif idx <= len(channelIds)/5*4:
-    #         youtube = youtube_4
-    #     else:
-    #         youtube = youtube_5
-    #     videoIds = get_video_ids(youtube, channelIds[idx])
-    #     videoDict[channelIds[idx]] = videoIds
-
-    import json
-    # with open('videoDict.json', 'w') as fp:
-    #     json.dump(videoDict, fp)
-
-    with open('videoDict.json', 'r') as fp:
-        videoDict = json.load(fp)
-
-    for channelId, videoIds in videoDict.items():
-        video_stats(youtube_3, videoIDs=videoIds, channelID=channelId, to_csv=True)
-
-    # for channelId, videoIds in videoDict.items():
-    #     for videoId in videoIds:
-    #         comment_threads(youtube_3, videoID=videoId, channelID=channelId, to_csv=True)
-
-    # with open('scrapedVideos.json', 'w') as fp:
-    #     json.dump(scraped_videos, fp)
-
-    # response = search_result("pyscript")
-    # response = channel_stats(youtube_2, channelId)
-    # response = comment_threads(youtube_2, videoID='pnecPXlfR5U', to_csv=False)
-    # print(response)
-
-    #NOTES -> troublesome video (videoId: CqssnS_v1a4) -> no likeCount, so I manually add 0 to it
\ No newline at end of file
+    channelId = 'UCzIxc8Vg53_ewaRIk3shBug'
\ No newline at end of file