# Heartily/xmldownloader.py

import xml.etree.ElementTree as ET
import random
import time
from fake_useragent import UserAgent
import requests
import datetime

# Timestamp of this run, rendered as "YYYY-MM-DD HH:MM:SS" (naive local time)
current_datetime = datetime.datetime.now()
run_time = f"{current_datetime:%Y-%m-%d %H:%M:%S}"

# RSS search feeds to download, one query per entry
urls = [
    "https://rss.indeed.com/rss?q=(it+OR+technology+OR+developer+OR+software)+AND+(bible+OR+christian+OR+jesus)+-LDS+-%22Latter-Day+Saints%22+-catholic+-christian.&fromage=14",
    "https://rss.indeed.com/rss?q=(marketing+OR+UI+OR+design)+AND+(bible+OR+christian+OR+jesus)+-LDS+-%22Latter-Day+Saints%22+-catholic+-christian.&fromage=14",
]

# Browser-like request headers; the User-Agent string is picked once per run
ua = UserAgent()
headers = {
    "User-Agent": ua.random,
    "Referer": "https://google.com",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

# One shared session so every request carries the headers above
session = requests.Session()
session.headers.update(headers)

# Root of the combined output document; items from every feed are appended here
output_root = ET.Element("root")

# Child tags copied from each RSS <item>. Defined once, outside the loop,
# because the list is the same for every feed.
attribute_names = ["title", "link", "source", "guid", "pubDate", "description", "{http://www.georss.org/georss}point"]

for url in urls:
    # Download the feed. The timeout keeps a stalled connection from hanging
    # the whole run indefinitely.
    response = session.get(url, timeout=30)
    # Fail with a clear HTTP error instead of handing an error page to the
    # XML parser.
    response.raise_for_status()
    content = response.content

    # Parse the downloaded XML content
    tree = ET.fromstring(content)

    # Copy the wanted values of every <item> onto a new <item> element,
    # stored as XML attributes keyed by the child tag name.
    for item in tree.findall(".//item"):
        new_element = ET.SubElement(output_root, "item")

        for attribute_name in attribute_names:
            child = item.find(attribute_name)
            # Not every item carries every child (find() returns None for a
            # missing one), and an empty child has text None; skip both
            # rather than crash or write an empty attribute.
            if child is None or child.text is None:
                continue
            new_element.set(attribute_name, child.text)

# Append a trailing <run_time> child holding this run's timestamp
run_time_element = ET.SubElement(output_root, "run_time")
run_time_element.text = run_time

# Destination file; any previous contents are replaced on each run
output_filename = "./indeed_output.xml"

# Wrap the root in a tree and serialize it as UTF-8 with an XML declaration.
# The file is opened in binary mode so the serializer controls the encoding.
output_tree = ET.ElementTree(output_root)
with open(output_filename, mode="wb") as xml_file:
    output_tree.write(xml_file, encoding="utf-8", xml_declaration=True)