-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrafa_rss_metadata.py
More file actions
119 lines (108 loc) · 3.94 KB
/
trafa_rss_metadata.py
File metadata and controls
119 lines (108 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import csv
import logging
import os
import re
from datetime import datetime
from email.utils import parsedate_to_datetime

import feedparser
import requests
# Set up the root logger: DEBUG threshold, each record prefixed with
# its timestamp and level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
def parse_date(date_value):
    """
    Parses date strings or datetime objects into "YYYY-MM-DD".

    Supported string formats, tried in this order:
      1. ISO 8601 as accepted by datetime.fromisoformat().
      2. PDF style, "D:YYYYMMDDHHMMSS" followed by anything.
      3. RFC 2822 / RFC 822 (the format of an RSS pubDate), e.g.
         "Mon, 06 Sep 2021 12:00:00 +0200".

    Args:
        date_value: A datetime, a date string, or any other object.

    Returns:
        The date formatted as "YYYY-MM-DD" when it could be parsed.
        A non-empty string that matches none of the formats is returned
        unchanged. Empty/blank strings and values that are neither str nor
        datetime yield "Unknown".
    """
    if isinstance(date_value, datetime):
        return date_value.strftime("%Y-%m-%d")
    if not isinstance(date_value, str):
        return "Unknown"

    text = date_value.strip()
    if not text:
        return "Unknown"

    # Try ISO format first.
    try:
        return datetime.fromisoformat(text).strftime("%Y-%m-%d")
    except ValueError:
        pass

    # If the string starts with D: (PDF style), read the 14 leading digits.
    if text.startswith("D:"):
        match = re.match(r"(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})", text[2:])
        if match:
            try:
                dt = datetime.strptime("".join(match.groups()), "%Y%m%d%H%M%S")
                return dt.strftime("%Y-%m-%d")
            except ValueError:
                # Digits present but not a real calendar date/time.
                pass
        return date_value

    # RSS feeds publish dates in RFC 822 form, which fromisoformat() rejects.
    try:
        return parsedate_to_datetime(text).strftime("%Y-%m-%d")
    except (TypeError, ValueError):
        # Unparseable input raises ValueError on Python 3.10+ and TypeError
        # on older versions; either way hand the caller its input back.
        return date_value
def sanitize_filename(name):
    """
    Makes a string usable as a file name.

    Each backslash, slash, asterisk, question mark, colon, double quote,
    angle bracket and pipe character is replaced by an underscore; all other
    characters are kept as they are.
    """
    forbidden = '\\/*?:"<>|'
    return "".join("_" if char in forbidden else char for char in name)
def download_html(url, base_folder, year, filename_hint, timeout=30):
    """
    Downloads the HTML content of the given URL and saves it in:
    base_folder/year/sanitized_filename.html

    Args:
        url: Address of the page to fetch.
        base_folder: Root directory for all downloads.
        year: Name of the sub-folder below base_folder.
        filename_hint: Text that is passed through sanitize_filename() and
            given an ".html" suffix to form the file name.
        timeout: Seconds handed to requests.get() as its timeout, so an
            unresponsive server cannot block the run indefinitely.

    Returns:
        The path of the written file, or None when the download or the
        write failed (the error is logged in both cases).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Error downloading HTML from {url}: {e}")
        return None

    filename = sanitize_filename(filename_hint) + ".html"
    folder_path = os.path.join(base_folder, year)
    local_path = os.path.join(folder_path, filename)
    try:
        # Directory creation is guarded too: a failure here must not abort
        # the caller's loop over the remaining feed items.
        os.makedirs(folder_path, exist_ok=True)
        with open(local_path, "w", encoding="utf-8") as f:
            f.write(response.text)
    except (OSError, ValueError) as e:
        # ValueError covers paths the OS layer rejects before any I/O
        # (e.g. an embedded NUL character) and text encoding errors.
        logging.error(f"Error saving HTML file {local_path}: {e}")
        return None
    logging.info(f"Saved HTML to {local_path}")
    return local_path
def process_rss_feed(feed_url):
    """
    Processes the RSS feed and extracts metadata from each feed item.
    Fields extracted:
    - Dokumentnamn: From the feed item's title.
    - Datum: From the feed item's published date (pubDate).
    - url: From the feed item's link.
    Also downloads the HTML of each feed item into a sorted folder structure
    (trafa_rss_downloads/<year>/), where <year> is "unknown" whenever no
    "YYYY-MM-DD" date could be determined for the item.

    Args:
        feed_url: Location of the RSS feed, passed to feedparser.parse().

    Returns:
        A list with one dict per feed entry, holding the keys
        "Dokumentnamn", "Datum", "url" and "LocalPath". "LocalPath" is
        whatever download_html() returned for that entry.
    """
    feed = feedparser.parse(feed_url)
    # feedparser signals fetch/parse problems through the bozo flag instead
    # of raising, so surface them here rather than returning silently.
    if feed.get("bozo"):
        logging.warning(
            f"Problem reading RSS feed {feed_url}: {feed.get('bozo_exception')}"
        )

    iso_date = re.compile(r"(\d{4})-\d{2}-\d{2}")
    items = []
    base_download_folder = "trafa_rss_downloads"
    for entry in feed.entries:
        dokumentnamn = entry.get("title", "N/A")
        raw_date = entry.get("published", "Unknown")
        datum = parse_date(raw_date)
        url = entry.get("link", feed_url)

        iso_match = iso_date.fullmatch(datum)
        if iso_match is None:
            # The text form could not be normalised; fall back to the
            # time tuple feedparser derives from the same field, if any.
            parsed = entry.get("published_parsed")
            if parsed:
                try:
                    datum = datetime(*parsed[:6]).strftime("%Y-%m-%d")
                except (TypeError, ValueError):
                    pass
                else:
                    iso_match = iso_date.fullmatch(datum)

        # Determine year folder. Only a validated date supplies the year, so
        # an unparsed raw date string never becomes a directory name.
        year = iso_match.group(1) if iso_match else "unknown"
        local_path = download_html(url, base_download_folder, year, dokumentnamn)
        items.append({
            "Dokumentnamn": dokumentnamn,
            "Datum": datum,
            "url": url,
            "LocalPath": local_path,
        })
    logging.info(f"Processed {len(items)} items from RSS feed {feed_url}")
    return items
def main():
    """
    Reads the feed at https://www.trafa.se/kb-rss/ and writes the collected
    item metadata to trafa_rss_metadata.csv; a failure while writing the CSV
    is logged instead of raised.
    """
    csv_path = "trafa_rss_metadata.csv"
    columns = ["Dokumentnamn", "Datum", "url", "LocalPath"]
    rows = process_rss_feed("https://www.trafa.se/kb-rss/")
    try:
        with open(csv_path, "w", newline="", encoding="utf-8") as handle:
            sheet = csv.DictWriter(handle, fieldnames=columns)
            sheet.writeheader()
            sheet.writerows(rows)
            logging.info(f"RSS metadata written to {csv_path}")
    except Exception as err:
        logging.error(f"Error writing CSV file: {err}")
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()