# Heartily/xmldownloader.py

import xml.etree.ElementTree as ET
import random
import time
from fake_useragent import UserAgent
import requests
import datetime

def copy_item_fields(item, target, field_names):
    """Copy the text of selected children of *item* onto *target* as attributes.

    Args:
        item: Source element (an RSS ``<item>``) whose direct children are read.
        target: Element that receives one attribute per child that was found.
        field_names: Child tag names to look up; each tag name is reused
            verbatim as the attribute name on *target*.

    Children that are absent from *item*, or present but without text, are
    skipped so that one incomplete item cannot abort processing of the feed.
    """
    for field_name in field_names:
        child = item.find(field_name)
        # ``find`` returns None for a missing child; guard before reading
        # ``.text`` instead of letting an AttributeError escape.
        if child is None or child.text is None:
            continue
        target.set(field_name, child.text)


try:
    # Capture the run timestamp up front; it is written to the output below.
    current_datetime = datetime.datetime.now()
    run_time = current_datetime.strftime("%Y-%m-%d %H:%M:%S")

    # RSS feeds to download and merge into a single output document
    urls = [
        "https://rss.indeed.com/rss?q=(it+OR+technology+OR+developer+OR+software)+AND+(bible+OR+christian+OR+jesus)+-LDS+-%22Latter-Day+Saints%22+-catholic+-christian.&fromage=14",
        "https://rss.indeed.com/rss?q=(marketing+OR+UI+OR+design)+AND+(bible+OR+christian+OR+jesus)+-LDS+-%22Latter-Day+Saints%22+-catholic+-christian.&fromage=14",
    ]

    # Request headers sent with every download, including a random user agent
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random,
        'Referer': 'https://google.com',
        'Accept-Language': 'en-US,en;q=0.9',
        # Only advertise encodings requests always decodes. Brotli ("br") is
        # decoded only when an optional brotli package is installed; without
        # it the body would reach the XML parser still compressed.
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    # Child elements of each <item> that are copied into the output
    attribute_names = ["title", "link", "source", "guid", "pubDate", "description", "{http://www.georss.org/georss}point"]

    # Seconds to wait for the server before giving up on a feed, so a stalled
    # connection cannot hang the script indefinitely.
    request_timeout = 30

    # Root element that collects the extracted items from every feed
    output_root = ET.Element("root")

    # The session keeps cookies across requests; the context manager closes
    # its connections when the downloads are finished.
    with requests.Session() as session:
        session.headers.update(headers)

        for url in urls:
            try:
                # Download the feed; non-successful status codes raise here
                response = session.get(url, timeout=request_timeout)
                response.raise_for_status()

                # Parse the downloaded XML content
                tree = ET.fromstring(response.content)

                # Emit one output <item> per feed <item>
                for item in tree.findall(".//item"):
                    new_element = ET.SubElement(output_root, "item")
                    copy_item_fields(item, new_element, attribute_names)
            except requests.exceptions.RequestException as e:
                # Handle request-related errors (including timeouts); the
                # remaining feeds are still processed.
                print("An error occurred while downloading the XML:", str(e))
            except (ET.ParseError, AttributeError) as e:
                # Handle XML parsing errors
                print("An error occurred while parsing the XML:", str(e))

    # Add a new element for the run time
    run_time_element = ET.SubElement(output_root, "run_time")
    run_time_element.text = run_time

    # Create an ElementTree object with the output root
    output_tree = ET.ElementTree(output_root)

    # Write the output to the XML file, overwriting the existing data
    output_filename = "./indeed_output.xml"
    with open(output_filename, 'wb') as f:
        output_tree.write(f, encoding="utf-8", xml_declaration=True)

    print("Script execution completed successfully.")

except Exception as e:
    # Top-level boundary: report any other uncaught exception
    print("An unexpected error occurred:", str(e))