error handling added to downloader
This commit is contained in:
parent
3c33a88fe0
commit
8bc0d2dac7
114
xmldownloader.py
114
xmldownloader.py
@ -5,65 +5,81 @@ from fake_useragent import UserAgent
|
||||
import requests
|
||||
import datetime
|
||||
|
||||
# Download the configured Indeed RSS feeds, copy selected fields of every
# <item> into a single combined XML document, and write that document to
# ./indeed_output.xml together with the time of this run.
#
# Errors for a single feed (network or XML parsing) are reported and the
# remaining feeds are still processed; anything else is reported by the outer
# handler at the bottom.
try:
    # Get the current date and time, formatted as a string for the output file
    current_datetime = datetime.datetime.now()
    run_time = current_datetime.strftime("%Y-%m-%d %H:%M:%S")

    # Define the list of URLs
    urls = [
        "https://rss.indeed.com/rss?q=(it+OR+technology+OR+developer+OR+software)+AND+(bible+OR+christian+OR+jesus)+-LDS+-%22Latter-Day+Saints%22+-catholic+-christian.&fromage=14",
        "https://rss.indeed.com/rss?q=(marketing+OR+UI+OR+design)+AND+(bible+OR+christian+OR+jesus)+-LDS+-%22Latter-Day+Saints%22+-catholic+-christian.&fromage=14",
    ]

    # Child elements of each <item> that are copied into the output.
    # Hoisted out of the download loop because the list never changes.
    attribute_names = ["title", "link", "source", "guid", "pubDate", "description", "{http://www.georss.org/georss}point"]

    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would block the script indefinitely.
    request_timeout = 30

    # Set a custom user agent
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random,
        'Referer': 'https://google.com',
        'Accept-Language': 'en-US,en;q=0.9',
        # NOTE(review): advertising 'br' presumably requires a brotli decoder
        # to be installed for requests/urllib3 -- confirm, otherwise a
        # brotli-encoded body may reach the XML parser undecoded.
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    # Create a session object to handle cookies
    session = requests.Session()
    session.headers.update(headers)

    # Create a new XML root element to store all the extracted values
    output_root = ET.Element("root")

    for url in urls:
        try:
            # Download the XML file
            response = session.get(url, timeout=request_timeout)
            response.raise_for_status()  # Raise an exception for non-successful status codes

            # Read the downloaded XML content
            tree = ET.fromstring(response.content)

            # Iterate over the "item" elements and extract the desired values
            for item in tree.findall(".//item"):
                new_element = ET.SubElement(output_root, "item")

                # Copy each desired child onto the new element as an attribute
                for attribute_name in attribute_names:
                    child = item.find(attribute_name)
                    # find() returns None when the child is absent; skip just
                    # this field instead of abandoning the rest of the feed.
                    if child is None or child.text is None:
                        continue
                    new_element.set(attribute_name, child.text)

        except requests.exceptions.RequestException as e:
            # Handle request-related errors (includes timeouts and HTTP errors)
            print("An error occurred while downloading the XML:", str(e))
        except (ET.ParseError, AttributeError) as e:
            # Handle XML parsing errors
            print("An error occurred while parsing the XML:", str(e))

    # Add a new element for the run time
    run_time_element = ET.SubElement(output_root, "run_time")
    run_time_element.text = run_time

    # Create an ElementTree object with the output root
    output_tree = ET.ElementTree(output_root)

    # Write the output to the XML file, overwriting the existing data
    output_filename = "./indeed_output.xml"
    with open(output_filename, 'wb') as f:
        output_tree.write(f, encoding="utf-8", xml_declaration=True)

    print("Script execution completed successfully.")

except Exception as e:
    # Handle any other uncaught exceptions
    print("An unexpected error occurred:", str(e))
|
||||
|
Reference in New Issue
Block a user