74 lines
2.3 KiB
Python
74 lines
2.3 KiB
Python
import xml.etree.ElementTree as ET
|
|
import urllib.request
|
|
import random
|
|
import time
|
|
from fake_useragent import UserAgent
|
|
import requests
|
|
import datetime
|
|
|
|
# Download an Indeed RSS feed, keep a fixed set of fields from every <item>,
# and write them (plus the script's run time) to a new XML file.

# Record when the script ran; the formatted string is printed now and later
# stored in the <run_time> element of the output document.
current_datetime = datetime.datetime.now()
run_time = current_datetime.strftime("%Y-%m-%d %H:%M:%S")
print("Script run time:", run_time)

# URL of the RSS/XML feed to download
url = "https://rss.indeed.com/rss?q=(it+OR+technology+OR+developer+OR+software)+AND+(bible+OR+christian+OR+jesus+OR+god)+-LDS+-%22Latter-Day+Saints%22.xml"

# Browser-like request headers with a randomly chosen User-Agent
ua = UserAgent()
headers = {
    'User-Agent': ua.random,
    'Referer': 'https://google.com',
    'Accept-Language': 'en-US,en;q=0.9',
    # "br" is intentionally not advertised: requests can only decode Brotli
    # when an optional Brotli package is installed, and an undecoded body
    # would be written to disk and then fail to parse as XML.
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}

# Where the raw feed is stored before it is parsed
filename = "/home/gordon/Documents/Code/heartily/indeed_input.xml"

# Use a session (handles cookies) and make sure it is closed afterwards
with requests.Session() as session:
    session.headers.update(headers)
    # A timeout prevents the script from hanging forever on a stalled
    # connection; requests applies no timeout by default.
    response = session.get(url, timeout=30)
    # Stop on HTTP errors instead of saving an error page as the feed.
    response.raise_for_status()

with open(filename, 'wb') as f:
    f.write(response.content)

# Introduce a random delay between 2 and 5 seconds
delay = random.uniform(2, 5)
time.sleep(delay)

# Read the downloaded XML file back from disk
tree = ET.parse(filename)
root = tree.getroot()

# Child elements of each <item> whose text is copied to the output
attribute_names = ["title", "link", "source", "guid", "pubDate", "description", "{http://www.georss.org/georss}point"]

# New XML root element that collects the extracted values
new_root = ET.Element("root")

# Each feed <item> becomes an output <item> whose XML attributes carry the
# text of the selected child elements.
for item in root.findall(".//item"):
    new_element = ET.SubElement(new_root, "item")

    for attribute_name in attribute_names:
        child = item.find(attribute_name)
        # find() returns None when the item has no such child; skip missing
        # or empty children instead of failing on `.text`.
        if child is None or child.text is None:
            continue
        new_element.set(attribute_name, child.text)

# Add a new element for the run time
run_time_element = ET.SubElement(new_root, "run_time")
run_time_element.text = run_time

# Create an ElementTree object and write it to a new XML file
new_tree = ET.ElementTree(new_root)
output_filename = "/home/gordon/Documents/Code/heartily/indeed_output.xml"
new_tree.write(output_filename, encoding="utf-8", xml_declaration=True)
|