A Python script that parses an Atom feed from status.cafe and exports the data to a CSV file
# =============================================================================
#
# Author: jskherman
# Date: 2024-03-08
# Description:
# This script parses an XML file from status.cafe in the Atom syndication
# format (the site's web feed). The script extracts the relevant information
# from each entry in the feed: the title, author, published timestamp,
# content, ID, and link. It then writes this information to a CSV file.
#
# Specifically, the script performs the following tasks:
#
# 1. Prompts the user to enter the name of the XML file to parse.
# 2. Parses the provided XML file using the xml.etree.ElementTree module.
# 3. Extracts the following information from each entry in the feed:
#    - Title
#    - Author
#    - Published timestamp (converted to a UNIX timestamp)
#    - Content (with HTML entities decoded)
#    - ID (extracted from the URL in the 'id' node)
#    - Link (for the 'alternate' link)
#    - Emoji (extracted from the title, assuming it's the second word)
# 4. Checks for duplicate IDs in the feed entries. If any duplicates are
#    found, it raises a ValueError with the duplicate IDs and their
#    corresponding status updates.
# 5. Writes the extracted information to a CSV file named 'output.csv',
#    with the following columns:
#    - ID
#    - Timestamp
#    - Author
#    - Emoji
#    - Status
#    - Link
# 6. Prints a success message if the execution is successful.
# 7. If an exception occurs during execution, it prints the error message.
#
# Dependencies:
# - xml.etree.ElementTree: for parsing the XML file
# - csv: for writing the data to a CSV file
# - html: for decoding HTML entities in the content
# - re: for using regular expressions to extract the ID from the URL
# - datetime: for converting the timestamp string to a UNIX timestamp
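#
# For reference, the script expects each Atom <entry> element in the input
# file to look roughly like the sketch below (illustrative values only; the
# element names match what the code reads from the Atom namespace
# xmlns="http://www.w3.org/2005/Atom", but the exact URLs and the
# "<username> <emoji>" title format are assumptions about status.cafe's feed):
#
#   <entry>
#     <title>username 😅</title>
#     <author><name>username</name></author>
#     <published>2024-03-07T16:28:00Z</published>
#     <content>Hello &amp; welcome!</content>
#     <id>https://status.cafe/users/username/statuses/12345</id>
#     <link rel="alternate" href="https://status.cafe/users/username#12345"/>
#   </entry>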
#
# Usage:
# 1. Save this script to a file (e.g., feedparser.py).
# 2. Run the script using the Python interpreter: `python feedparser.py`
# 3. When prompted, enter the name of the XML file to parse (e.g., feed.xml).
# 4. The script will process the file and generate an 'output.csv' file in
#    the same directory.
# 5. If any errors occur, the script will print the error message along with
#    a hint to check that the input file is valid XML.
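#
# Example session (illustrative; your input file name will differ):
#
#   $ python feedparser.py
#   Enter the name of the file to parse: feed.xml
#   Execution successful! CSV file 'output.csv' has been created.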
#
# =============================================================================
import xml.etree.ElementTree as ET
import csv
from html import unescape
import re
from datetime import datetime


def extract_emoji(title):
    """
    Extract the emoji from the title string.

    Args:
        title (str): The title string from which to extract the emoji.

    Returns:
        str: The extracted emoji, or an empty string if no emoji is found.
    """
    # Split the title by whitespace
    title_parts = title.split()

    # Get the second element as the emoji
    if len(title_parts) > 1:
        return title_parts[1]
    else:
        return ""


def get_status_id(id_url):
    """
    Extract the status ID from the given URL.

    Args:
        id_url (str): The URL containing the status ID.

    Returns:
        str: The extracted status ID, or an empty string if no ID is found.
    """
    # Extract the digits at the end of the URL
    match = re.search(r"/(\d+)$", id_url)
    if match:
        return match.group(1)
    else:
        return ""


def main():
    try:
        # Ask the user for the file name
        file_name = input("Enter the name of the file to parse: ")

        # Parse the XML file
        tree = ET.parse(file_name)
        root = tree.getroot()

        # Open a CSV file for writing
        with open("output.csv", "w", newline="", encoding="utf-8") as csvfile:
            fieldnames = [
                "ID",
                "Timestamp",
                "Author",
                "Emoji",
                "Status",
                "Link",
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            # Write the header row in the CSV file
            writer.writeheader()

            # Initialize a set to store unique IDs
            unique_ids = set()
            # Initialize a dictionary to store duplicate IDs and their
            # corresponding status updates
            duplicate_ids = {}

            # Iterate over the entries in the XML file
            for entry in root.findall("{http://www.w3.org/2005/Atom}entry"):
                title = entry.find("{http://www.w3.org/2005/Atom}title").text
                author = entry.find(
                    "{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name"
                ).text
                timestamp_str = entry.find(
                    "{http://www.w3.org/2005/Atom}published"
                ).text

                # Convert the timestamp string to a UNIX timestamp
                timestamp = int(
                    datetime.fromisoformat(
                        timestamp_str.replace("Z", "+00:00")
                    ).timestamp()
                )
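                # e.g. (illustrative value, not taken from a real feed):
                #   "2024-03-07T16:28:00Z" -> 1709828880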

                content = unescape(
                    entry.find("{http://www.w3.org/2005/Atom}content").text
                )
                id_url = entry.find("{http://www.w3.org/2005/Atom}id").text
                status_id = get_status_id(id_url)

                link = entry.find(
                    '{http://www.w3.org/2005/Atom}link[@rel="alternate"]'
                )
                if link is not None:
                    link = link.attrib["href"]
                else:
                    link = ""

                emoji_mood = extract_emoji(title)

                # Check if the ID is already in the set
                if status_id in unique_ids:
                    # If it's a duplicate, store it in the duplicate_ids dictionary
                    duplicate_ids[status_id] = content
                else:
                    # If it's a unique ID, add it to the set and write the row to the CSV
                    unique_ids.add(status_id)
                    writer.writerow(
                        {
                            "ID": status_id,
                            "Timestamp": timestamp,
                            "Author": author,
                            "Emoji": emoji_mood,
                            "Status": content,
                            "Link": link,
                        }
                    )

        # Check if there were any duplicate IDs
        if duplicate_ids:
            # Raise an error with the duplicate IDs and their corresponding
            # status updates
            duplicate_ids_str = "\n".join(
                [
                    f"ID: {dup_id}, Status: {duplicate_ids[dup_id]}"
                    for dup_id in duplicate_ids
                ]
            )
            raise ValueError(
                f"Duplicate IDs found in the input file:\n{duplicate_ids_str}"
            )

        print("Execution successful! CSV file 'output.csv' has been created.")

    except Exception as e:
        print(f"An error occurred: {e}")
        print("Please ensure that the file you provided is a valid XML file.")


if __name__ == "__main__":
    main()
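
# With the illustrative <entry> sketched in the header above, the resulting
# output.csv would contain a row along these lines (values are made up):
#
#   ID,Timestamp,Author,Emoji,Status,Link
#   12345,1709828880,username,😅,Hello & welcome!,https://status.cafe/users/username#12345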