Skip to content

Instantly share code, notes, and snippets.

@Modder4869
Last active June 21, 2023 20:55
Show Gist options
  • Save Modder4869/89264212768240a43d1efb4764792d04 to your computer and use it in GitHub Desktop.
Save Modder4869/89264212768240a43d1efb4764792d04 to your computer and use it in GitHub Desktop.
import requests
import io
import struct
import zipfile
import binascii
import re
import sys
import os
from tqdm import tqdm
#copied from https://betterprogramming.pub/how-to-know-zip-content-without-downloading-it-87a5b30be20a
# Number of trailing bytes get_zip_file() fetches as the end-of-central-directory
# (EOCD) record.
# NOTE(review): 22 presumably is the EOCD size when the archive carries no
# trailing comment -- confirm against the ZIP specification.
EOCD_RECORD_SIZE = 22
# Number of bytes get_zip_file() fetches as the ZIP64 EOCD record.
ZIP64_EOCD_RECORD_SIZE = 56
# Number of bytes get_zip_file() fetches as the ZIP64 EOCD locator, which it
# expects directly in front of the classic EOCD record.
ZIP64_EOCD_LOCATOR_SIZE = 20
# Largest file size (2**32 - 1 bytes) that get_zip_file() treats as a classic
# zip; anything bigger goes through the ZIP64 code path.
MAX_STANDARD_ZIP_SIZE = 4_294_967_295
def retrieve_zip_content(url, regex_pattern, output_folder):
    """Download the entries of a remote zip whose names match a regex.

    Only the archive's central directory and the matching entries are
    transferred, using HTTP range requests.

    Args:
        url: Address of the remote zip archive.
        regex_pattern: Regular expression searched (case-insensitively) in
            each entry name.
        output_folder: Directory the matching entries are written into.
    """
    # extract_matching_files() is called without a URL and reads the
    # module-level name ``zipUrl`` for its downloads.  That name is otherwise
    # only bound by the ``__main__`` block, so bind it to *this* call's URL:
    # it keeps imported use from failing with NameError and guarantees the
    # entries are fetched from the same archive the directory came from.
    global zipUrl
    zipUrl = url

    zip_file = get_zip_file(url)
    extract_matching_files(zip_file, regex_pattern, output_folder)
def get_zip_file(url):
    """Build a ZipFile that contains only the remote archive's directory.

    The end-of-central-directory (EOCD) record is fetched from the tail of
    the remote file, the central directory it points to is fetched next, and
    both are glued together in memory so ``zipfile`` can parse the entry
    list without the archive's payload being downloaded.

    Args:
        url: Address of the remote zip archive.

    Returns:
        A ``zipfile.ZipFile`` backed by an in-memory buffer holding the
        central directory followed by the end records.

    Raises:
        zipfile.BadZipFile: if the file is too small, or the expected end
            records are not found at the very end of the file (for example
            because the archive carries a trailing comment).
    """
    file_size = get_file_size(url)
    if file_size < EOCD_RECORD_SIZE:
        raise zipfile.BadZipFile(
            f"Remote file is too small to be a zip archive ({file_size} bytes)"
        )

    eocd_record = fetch(url, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE)
    # The record is assumed to be the final EOCD_RECORD_SIZE bytes of the
    # file.  Verify its signature so a misplaced record fails loudly instead
    # of producing nonsense offsets for the next request.
    if eocd_record[:4] != b"PK\x05\x06":
        raise zipfile.BadZipFile(
            "End-of-central-directory record not found at the end of the file "
            "(archives with a trailing comment are not supported)"
        )

    cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
    # Both fields are unsigned 32-bit values; masking keeps them non-negative
    # no matter how the helper interpreted the top bit.
    cd_start &= 0xFFFFFFFF
    cd_size &= 0xFFFFFFFF

    # 0xFFFFFFFF in either field means the real value lives in the ZIP64
    # records, which can happen even when the file itself is below 4 GiB.
    needs_zip64 = (
        file_size > MAX_STANDARD_ZIP_SIZE
        or cd_start == 0xFFFFFFFF
        or cd_size == 0xFFFFFFFF
    )
    if not needs_zip64:
        central_directory = fetch(url, cd_start, cd_size)
        return zipfile.ZipFile(io.BytesIO(central_directory + eocd_record))

    zip64_tail_size = (
        EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE
    )
    if file_size < zip64_tail_size:
        raise zipfile.BadZipFile(
            "Remote file is too small to contain ZIP64 end records"
        )

    # Layout assumed at the end of the file:
    #   [ZIP64 EOCD record][ZIP64 EOCD locator][EOCD record]
    zip64_eocd_record = fetch(
        url,
        file_size - zip64_tail_size,
        ZIP64_EOCD_RECORD_SIZE,
    )
    if zip64_eocd_record[:4] != b"PK\x06\x06":
        raise zipfile.BadZipFile(
            "ZIP64 end-of-central-directory record not found where expected"
        )
    zip64_eocd_locator = fetch(
        url,
        file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
        ZIP64_EOCD_LOCATOR_SIZE,
    )
    cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
    central_directory = fetch(url, cd_start, cd_size)
    return zipfile.ZipFile(
        io.BytesIO(central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record)
    )
def get_file_size(url):
    """Return the size in bytes of the resource at ``url``.

    The size is taken from the ``Content-Length`` header of a HEAD request.

    Args:
        url: Address of the remote file.

    Returns:
        The reported size as an int.

    Raises:
        ValueError: if the final response status is not 200 or the server
            does not report a ``Content-Length``.
    """
    # requests.head() does not follow redirects unless asked to; download
    # links commonly redirect to a CDN, so follow them to the real file.
    response = requests.head(url, allow_redirects=True)
    if response.status_code != 200:
        raise ValueError("Error retrieving file size:", response.status_code)

    content_length = response.headers.get("Content-Length")
    if content_length is None:
        # Without this guard int(None) would surface as an opaque TypeError.
        raise ValueError("Error retrieving file size: no Content-Length header")
    return int(content_length)
def fetch(url, start, length):
    """Download ``length`` bytes of ``url`` beginning at byte ``start``.

    Args:
        url: Address of the remote file.
        start: Zero-based offset of the first byte to fetch.
        length: Number of bytes to fetch.

    Returns:
        The requested bytes; ``b""`` when ``length`` is 0 (no request is
        made in that case).

    Raises:
        ValueError: if ``start`` or ``length`` is negative, or the server
            does not answer with 206 Partial Content.
    """
    if start < 0 or length < 0:
        raise ValueError(f"Invalid byte range: start={start}, length={length}")
    if length == 0:
        # "bytes=N-(N-1)" is not a valid range; a server would typically
        # ignore it and send the whole file.
        return b""

    end = start + length - 1
    headers = {"Range": f"bytes={start}-{end}"}
    response = requests.get(url, headers=headers)
    # Anything other than 206 means the body is not the slice that was asked
    # for (error page, or the full file because Range was ignored).
    if response.status_code != 206:
        raise ValueError("Error fetching byte range:", response.status_code)
    return response.content
def get_central_directory_metadata_from_eocd(eocd):
    """Read the central directory's location from an EOCD record.

    Args:
        eocd: Raw bytes of the end-of-central-directory record.

    Returns:
        ``(cd_start, cd_size)``: the values decoded from bytes 16-19 and
        bytes 12-15 of ``eocd`` respectively.
    """
    # Two consecutive 4-byte little-endian fields: size first, then offset.
    size_in_bytes, start_offset = (
        parse_little_endian_to_int(eocd[field_pos:field_pos + 4])
        for field_pos in (12, 16)
    )
    return start_offset, size_in_bytes
def get_central_directory_metadata_from_eocd64(eocd64):
    """Read the central directory's location from a ZIP64 EOCD record.

    Args:
        eocd64: Raw bytes of the ZIP64 end-of-central-directory record.

    Returns:
        ``(cd_start, cd_size)``: the values decoded from bytes 48-55 and
        bytes 40-47 of ``eocd64`` respectively.
    """
    # Two consecutive 8-byte little-endian fields: size first, then offset.
    size_in_bytes, start_offset = (
        parse_little_endian_to_int(eocd64[field_pos:field_pos + 8])
        for field_pos in (40, 48)
    )
    return start_offset, size_in_bytes
def parse_little_endian_to_int(little_endian_bytes):
    """Decode a 4- or 8-byte little-endian field as an unsigned integer.

    The callers pass size and offset fields of zip end records.  Those are
    unsigned quantities, so they must not be decoded with a signed format:
    a value of 2 GiB or more would otherwise come back negative.

    Args:
        little_endian_bytes: Exactly 4 bytes (32-bit field) or 8 bytes
            (64-bit field).

    Returns:
        The decoded non-negative int.

    Raises:
        struct.error: if the input is neither 4 nor 8 bytes long.
    """
    format_character = "I" if len(little_endian_bytes) == 4 else "Q"
    return struct.unpack("<" + format_character, little_endian_bytes)[0]
def _local_header_size(url, offset):
    """Return the full length of the local file header found at ``offset``."""
    header = fetch(url, offset, 30)
    if len(header) != 30 or header[:4] != b"PK\x03\x04":
        raise zipfile.BadZipFile(f"No local file header at offset {offset}")
    # Bytes 26-27: file name length, bytes 28-29: extra field length.  The
    # local extra field may differ from the one in the central directory, so
    # its length can only be learned from the remote header itself.
    name_length, extra_length = struct.unpack_from("<HH", header, 26)
    return 30 + name_length + extra_length


def extract_matching_files(zip_file, regex_pattern, output_folder, url=None):
    """Download every archive entry whose name matches ``regex_pattern``.

    For each matching entry the raw byte range "local file header +
    compressed data" is fetched from the remote archive and written,
    unmodified, to ``<output_folder>/<entry base name>.zip``.  ``zip_file``
    is closed before returning, whether or not an error occurred.

    Args:
        zip_file: ``zipfile.ZipFile`` built from the remote archive's central
            directory (as returned by ``get_zip_file``).
        regex_pattern: Regular expression searched (case-insensitively) in
            each entry name.
        output_folder: Directory the downloaded entries are written into.
        url: Address of the remote archive.  When omitted, the module-level
            ``zipUrl`` set by the command-line entry point is used.

    Raises:
        ValueError: if no URL is available or the server does not honour the
            range request.
        zipfile.BadZipFile: if no local file header is found at an entry's
            computed offset.
    """
    try:
        if url is None:
            # Backward compatibility: this function used to read the global.
            url = globals().get("zipUrl")
        if url is None:
            raise ValueError("No archive URL given and module-level zipUrl is not set")

        entries = zip_file.filelist
        if not entries:
            return

        matcher = re.compile(regex_pattern, re.IGNORECASE)

        # zip_file was parsed from a buffer that lacks the archive's payload,
        # so every header_offset is shifted by the same amount.  Re-base on
        # the smallest offset.
        # NOTE(review): this assumes the remote archive starts with a local
        # file header at byte 0 (no prepended data) -- confirm for your inputs.
        base_offset = min(entry.header_offset for entry in entries)

        for zi in entries:
            # Directory entries have no payload and an empty base name.
            if zi.is_dir() or not matcher.search(zi.filename):
                continue

            local_header_offset = zi.header_offset - base_offset
            file_name = zi.filename.split("/")[-1]
            print(f"downloading {file_name}")
            output_path = f"{output_folder}/{file_name}.zip"
            # Create parent directories if they don't exist.
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            size = _local_header_size(url, local_header_offset) + zi.compress_size
            headers = {"Range": f"bytes={local_header_offset}-{local_header_offset + size - 1}"}
            with requests.get(url, stream=True, headers=headers) as response:
                if response.status_code != 206:
                    # A 200 here would mean the whole archive is being sent.
                    raise ValueError("Error downloading entry:", response.status_code)
                # Download with progress bar and speed indicator.
                with open(output_path, "wb") as f, tqdm(
                    total=size, unit="B", unit_scale=True, unit_divisor=1024, ncols=80
                ) as progress_bar:
                    for data in response.iter_content(chunk_size=8192):
                        f.write(data)
                        progress_bar.update(len(data))
            print(f"Extracted: {output_path}")
    finally:
        zip_file.close()
if __name__ == "__main__":
    # Command-line entry point: expects exactly three positional arguments.
    if len(sys.argv) != 4:
        print("Usage: python script.py <zip_url> <regex_pattern> <output_folder>")
        sys.exit(1)

    # These stay module-level names on purpose: extract_matching_files()
    # looks up ``zipUrl`` as a global when it downloads entries.
    zipUrl, regexPattern, outputFolder = sys.argv[1:4]
    retrieve_zip_content(zipUrl, regexPattern, outputFolder)
@Matsko3
Copy link

Matsko3 commented Jun 21, 2023

very very crazy code

@Modder4869
Copy link
Author

very very crazy code

😔

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment