Skip to content

Instantly share code, notes, and snippets.

@koaning
Last active October 1, 2022 15:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save koaning/19fd1ba63d79baa8bb38cf626b8c65c5 to your computer and use it in GitHub Desktop.
Save koaning/19fd1ba63d79baa8bb38cf626b8c65c5 to your computer and use it in GitHub Desktop.
HTML parsing benchmark
import timeit
import requests
import html_text
import justext
from selectolax.parser import HTMLParser
# Two inputs of very different size: a real-world page fetched over the
# network and a tiny hand-written snippet.
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently benchmarking an HTTP error page.
response = requests.get("http://planet.python.org/", timeout=30)
response.raise_for_status()
html_long = response.content.decode("utf-8")
html_short = "<p><b>This</b> is just a small example.</p>"

# Run benchmark: time every extraction library against every HTML variant.
htmls = {"long": html_long, "short": html_short}
# Maps (html_name, method) -> total seconds reported by timeit for 500 calls.
results = {}
for html_name, html in htmls.items():
    # `html=html` binds the current document at definition time, so each
    # callable keeps working on its own input even if it is invoked after the
    # loop variable has moved on (late-binding closure pitfall).
    methods = {
        "justext": lambda html=html: justext.justext(html, tuple()),
        "htmltext": lambda html=html: html_text.extract_text(
            html, guess_layout=False
        ),
        "selectolax": lambda html=html: HTMLParser(html).text(),
    }
    for method, func in methods.items():
        # Progress output: shows which combination is currently being timed.
        print(method, html_name)
        results[(html_name, method)] = timeit.timeit(func, number=500)
# Pretty Print: render the collected timings as a rich table on the terminal.
# These imports sit here (rather than at the top of the file) because that is
# where the original script placed them.
from rich.console import Console
from rich.table import Table

table = Table(title="Benchmark Results")
table.add_column("HTML Variant", style="cyan")
table.add_column("Library", justify="right", style="magenta")
table.add_column("Time (s)", style="green")

# `results` is keyed by (html variant, library name) and holds the timeit
# total in seconds; one table row is emitted per measurement.
for (variant, library), speed in results.items():
    table.add_row(variant, library, str(speed))

console = Console()
console.print(table)
"""
                 Benchmark Results
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
┃ HTML Variant ┃    Library ┃ Time (s)             ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
│ long         │    justext │ 0.49477454099996976  │
│ long         │   htmltext │ 10.072820916999945   │
│ long         │ selectolax │ 14.882792749999908   │
│ short        │    justext │ 0.025378625000030297 │
│ short        │   htmltext │ 0.012803834000010283 │
│ short        │ selectolax │ 0.004145417000017915 │
└──────────────┴────────────┴──────────────────────┘
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment