Skip to content

Instantly share code, notes, and snippets.

@conceptofmind
Last active June 19, 2023 18:00
Show Gist options
  • Save conceptofmind/26c76c2d2732ac5e931ed52390089a0f to your computer and use it in GitHub Desktop.
Save conceptofmind/26c76c2d2732ac5e931ed52390089a0f to your computer and use it in GitHub Desktop.
import re
from my_secret_keys import api_key, cse_id
from googleapiclient.discovery import build
from fuzzywuzzy import fuzz
from playwright.sync_api import sync_playwright
# Query the Google Custom Search API and return the matching result items.
def google_search(input_query: str, api_key: str, cse_id: str, num_results: int = 3):
    """Run a Google Custom Search query and return its result items.

    Args:
        input_query: Search terms handed to ``custom_search``.
        api_key: API key handed to ``custom_search``.
        cse_id: Custom Search engine identifier handed to ``custom_search``.
        num_results: Number of results to request, forwarded as ``num``.
            Defaults to 3, the value that used to be hard-coded here. The
            Custom Search JSON API documents 1-10 as the accepted range.

    Returns:
        Whatever ``custom_search`` returned when that value is truthy,
        otherwise ``None``.
    """
    results = custom_search(
        input_query, num=num_results, api_key=api_key, cse_id=cse_id
    )
    # Collapse every falsy outcome (empty list, None) into None for callers.
    return results or None
def custom_search(query, api_key, cse_id, **kwargs):
    """Query the Google Custom Search API and return the result items.

    Args:
        query: Search terms, sent as the ``q`` parameter.
        api_key: Developer key used to build the ``customsearch`` v1 client.
        cse_id: Search engine identifier, sent as the ``cx`` parameter.
        **kwargs: Extra request parameters forwarded unchanged to
            ``cse().list`` (for example ``num``).

    Returns:
        The value stored under ``"items"`` in the response, or an empty list
        when the response carries no ``"items"`` key.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    response = service.cse().list(q=query, cx=cse_id, **kwargs).execute()
    # The API can omit "items" entirely when a query has no matches, so a
    # plain response["items"] would raise KeyError instead of reporting
    # "no results" to the caller.
    return response.get("items", [])
# Load a web page in a headless browser and extract a text excerpt from it.
def scrape_text(url):
    """Open *url* in headless Chromium and return a text snippet from it.

    Args:
        url: Address of the page to load.

    Returns:
        Whatever ``extract_snippet`` returns for the loaded page.
    """
    with sync_playwright() as pw:
        chromium = pw.chromium.launch(headless=True)
        browser_context = chromium.new_context()
        tab = browser_context.new_page()
        try:
            tab.goto(url)
            return extract_snippet(tab)
        finally:
            # Runs on success and on failure alike: release the context
            # first, then the browser that owns it.
            browser_context.close()
            chromium.close()
def extract_snippet(page, snippet_length=500):
    """Return the start of the first sufficiently long paragraph on *page*.

    Args:
        page: Object providing ``query_selector_all``; every element it
            returns for the ``"p"`` selector must provide ``inner_text()``.
        snippet_length: Minimum paragraph length to accept and also the
            number of characters returned. Defaults to 500, the value that
            used to be hard-coded here.

    Returns:
        The first ``snippet_length`` characters of the first paragraph whose
        text is at least ``snippet_length`` characters long, or ``None`` when
        no paragraph qualifies.
    """
    for paragraph in page.query_selector_all("p"):
        text = paragraph.inner_text()
        if len(text) >= snippet_length:
            return text[:snippet_length]
    return None
def truncate_document(document, snippet, match_ratio_threshold=0.75, before=100, max_length=500):
    """Cut *document* down to a window around *snippet*.

    Args:
        document: Text to truncate. May be empty or ``None``.
        snippet: Text expected to appear in (or resemble) ``document``.
        match_ratio_threshold: Minimum similarity between ``document`` and
            ``snippet``. Values up to 1 are read as a fraction (0.75 means
            75 %); larger values are read as a score on the 0-100 scale.
        before: Number of characters to keep ahead of the snippet.
        max_length: Maximum length of the returned window.

    Returns:
        ``snippet`` itself when ``document`` is empty or not similar enough;
        otherwise a slice of ``document`` at most ``max_length`` characters
        long, starting ``before`` characters ahead of the snippet, or at the
        beginning of the document when the snippet does not occur verbatim.
    """
    if not document:
        # Nothing to cut a window from.
        return snippet

    # token_set_ratio scores on an integer 0-100 scale, so a fractional
    # threshold has to be scaled up before the comparison; comparing the
    # score against 0.75 directly only ever rejected a score of exactly 0.
    if match_ratio_threshold <= 1:
        threshold = match_ratio_threshold * 100
    else:
        threshold = match_ratio_threshold
    if fuzz.token_set_ratio(document, snippet) < threshold:
        return snippet

    # str.find returns -1 when the snippet is not present verbatim; in that
    # case the window starts at the beginning of the document.
    position = document.find(snippet)
    start = max(0, position - before) if position >= 0 else 0
    return document[start:start + max_length]
def get_search_result(query, api_key, cse_id):
    """Search for *query* and return page text from the first usable result.

    Each result from ``google_search`` is loaded with ``scrape_text`` and the
    scraped text is passed to ``truncate_document`` together with the
    result's ``'snippet'``. The first result whose processed text differs
    from its snippet is returned.

    Args:
        query: Search terms.
        api_key: API key passed to ``google_search``.
        cse_id: Custom Search engine identifier passed to ``google_search``.

    Returns:
        The truncated page text of the first suitable result, or the string
        ``"No search results found."`` when there are no results or none of
        them yields suitable text.
    """
    # Imported locally: only this function needs the exception type.
    from playwright.sync_api import Error as PlaywrightError

    for result in google_search(query, api_key, cse_id) or ():
        url = result['link']
        snippet = result['snippet']
        try:
            page_text = scrape_text(url)
        except PlaywrightError:
            # A single unreachable or timed-out page should not abort the
            # whole search; move on to the next result.
            continue
        if page_text is None:
            # The page had no paragraph long enough to use.
            continue
        truncated_result = truncate_document(page_text, snippet)
        # truncate_document hands the snippet back unchanged when it rejects
        # the page, so equality with the snippet means "not suitable".
        if truncated_result != snippet:
            return truncated_result
    return "No search results found."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment