-
-
Save pukkandan/ee737fec64822f2552caf3ca4cbf5db7 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
SPDX-License-Identifier: MIT https://opensource.org/licenses/MIT
Copyright © 2021 pukkandan.ytdlp@gmail.com

* Input file is an info.json (with comments) that yt-dlp (https://github.com/yt-dlp/yt-dlp) wrote
* Change FIELDS according to your needs

The output file will be in the format:
[{
    'text': 'comment 1',
    ...
    'replies': [{
        'text': 'reply 1',
        ...
        'replies': [...],
    }, ...],
}, ...]
"""
import argparse
import json
import os.path
from datetime import datetime, timezone
def get_fields(dct):
    """Yield ``(name, value)`` pairs for every FIELDS entry present in *dct*.

    Each FIELDS entry maps an output field name to a callable
    ``fn(dct, name)``; entries whose callable returns None are omitted.
    """
    for name, fn in FIELDS.items():
        val = fn(dct, name)
        if val is not None:
            yield name, val


def filter_func(comments):
    """Reduce each comment dict in *comments* to just the keys in FIELDS."""
    return [dict(get_fields(c)) for c in comments]


def _format_timestamp(dct, name):
    """Return the UNIX timestamp ``dct[name]`` as 'YYYY/MM/DD' (UTC), or None if absent.

    Uses an explicit ``is None`` check so a timestamp of 0 (the epoch) is
    still formatted; the previous truthiness test passed the raw 0 through.
    ``datetime.utcfromtimestamp`` is deprecated since Python 3.12, so an
    aware UTC datetime is built instead (same formatted result).
    """
    ts = dct.get(name)
    if ts is None:
        return None
    return datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y/%m/%d')


# Output field name -> extractor(dct, name). Extractors returning None
# drop the field from the output (e.g. an empty 'replies' list).
FIELDS = {
    'text': dict.get,
    'author': dict.get,
    'timestamp': _format_timestamp,
    # Add more fields here
    'replies': lambda dct, name: filter_func(dct.get(name, [])) or None,
}
# Command-line interface: an info.json in, a comments file (json / html) out.
parser = argparse.ArgumentParser()
_options = (
    ('--input-file', '-i', 'inputfile',
     'File to read video metadata from (info.json)'),
    ('--output-file', '-o', 'outputfile',
     'File to write comments to (json / html)'),
)
for _long, _short, _dest, _help in _options:
    parser.add_argument(
        _long, _short, dest=_dest, metavar='FILE', required=True, help=_help)
args = parser.parse_args()
# Determine the output format from the output file's extension and fail
# fast on anything we cannot produce.
_, _suffix = os.path.splitext(args.outputfile)
ext = _suffix[1:]
if ext != 'html' and ext != 'json':
    raise SystemExit(f'ERROR: Only json and html formats are supported, not {ext}')
if ext == 'html':  # Error early if the converter is missing
    try:
        from json2html import json2html
    except ImportError:
        raise SystemExit('ERROR: json2html is required for html output. Install it with pip install json2html')
# Read the yt-dlp info.json; only its 'comments' list is used.
print('Reading file')
with open(args.inputfile, encoding='utf-8') as f:
    info_dict = json.load(f)
# Index comments by id, sorted chronologically (comments without a
# timestamp sort first via the `or 0` fallback).
comment_data = {c['id']: c for c in sorted(
    info_dict['comments'], key=lambda c: c.get('timestamp') or 0)}
count = len(info_dict['comments'])
del info_dict  # drop the full video metadata early; only comments are kept
# Re-nest the flat list: top-level comments go into nested_comments,
# replies are appended to their parent's 'replies' list. Since
# comment_data holds shared references, a parent appended to the root
# list later still carries the replies attached to it here.
nested_comments = []
for i, (cid, c) in enumerate(comment_data.items(), 1):
    print(f'Processing comment {i}/{count}', end='\r')
    parent = nested_comments if c['parent'] == 'root' else comment_data[c['parent']].setdefault('replies', [])
    parent.append(c)
# NOTE(review): this `del` raises NameError when there are zero comments,
# because the loop body never bound `parent`.
del parent
print('')  # step past the \r progress line
nested_comments = filter_func(nested_comments)  # keep only FIELDS keys
# Serialize in the format implied by the output extension (validated
# above to be 'json' or 'html').
if ext == 'json':
    print('Converting to json')
    out = json.dumps(nested_comments, indent=4, ensure_ascii=False)
elif ext == 'html':
    print('Converting to html')
    out = json2html.convert(nested_comments)
del nested_comments
print('Writing file')
with open(args.outputfile, 'w', encoding='utf-8') as f:
    f.write(out)
print('Done')
@VXsz fixed. Python not using utf-8 by default always trips me up
Is it possible to make this script also deal with comments on Rumble videos?
It should work for any comments written by yt-dlp, irrespective of website
Have you considered outputting to HTML page? Markup could pack more information and be more pleasant to read.
Have you considered outputting to HTML page? Markup could pack more information and be more pleasant to read.
do "pip install json2html" and add the following code to the end of this python file, so it will generate a readable HTML file
print('\nWriting html')
from json2html import *
comments_html = json2html.convert(json = filter_func(nested_comments))
with open (str(args.outputfile)+".html", 'w') as f:
f.write(comments_html)
print('Done')
Done. Though the html is ugly and I'm not gonna put in the effort to make it prettier.
Have you considered outputting to HTML page? Markup could pack more information and be more pleasant to read.
do "pip install json2html" and add the following code to the end of this python file, so it will generate a readable HTML file
print('\nWriting html')
from json2html import *
comments_html = json2html.convert(json = filter_func(nested_comments))
with open (str(args.outputfile)+".html", 'w') as f:
    f.write(comments_html)
print('Done')
Hello
For downloading comments on Youtube
Can you teach me how to use Visual Studio Code or Yt-dlp to download comments of video as HTML format? Which script and which way should I conduct?
Can I download only comments (without download URL video file of Youtube) of the whole playlist or the whole channel ?
Thank you
btw some files will fail due to some encoding issues; using
open(args.inputfile, encoding="UTF-8")
would be better. I usually get this issue with files that contain my language (and other shenanigans)