blob: 3803be7f3b3544e8f8f095f1ecdb73c31c2300cb (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
import datetime
import re
import sys
import urllib.request
import bs4
from feedgen import feed
# Scrape a blog index page and print an Atom feed of its most recent posts.
#
# Usage: script.py URL FEED_SIZE TITLE FEED_ID AUTHOR_NAME AUTHOR_EMAIL
#
# The page is expected to contain anchors whose text is "YYYY-MM-DD Title";
# everything between one such anchor's paragraph and the next post (or the
# final "Sobre mí" heading) becomes that entry's HTML content.
url = sys.argv[1]
feed_size = int(sys.argv[2])
feed_title = sys.argv[3]
feed_id = sys.argv[4]  # renamed from `id`, which shadowed the builtin
author_name = sys.argv[5]
author_email = sys.argv[6]

with urllib.request.urlopen(url) as response:
    page_html = response.read().decode("utf8")
soup = bs4.BeautifulSoup(page_html, features="lxml")

# "YYYY-MM-DD Title" — compiled once instead of on every anchor/paragraph.
post_re = re.compile("(....-..-..) (.*)")

fg = feed.FeedGenerator()
fg.title(feed_title)
fg.id(feed_id + "/")

posts = 0
for a in soup.find_all("a"):
    if posts == feed_size:
        break
    # `a.string` is None for empty anchors or anchors with nested markup;
    # the original passed None to re.fullmatch and raised TypeError.
    match = post_re.fullmatch(a.string) if a.string else None
    if not match:
        continue
    post_title = match.group(2)
    date = datetime.datetime.strptime(match.group(1), "%Y-%m-%d").date()
    fi = fg.add_item()
    fi.title(post_title)
    fi.id(url + a["href"])
    fi.link(href=a["href"])
    fi.author(name=author_name, email=author_email)
    # Atom requires a timezone-aware timestamp; posts carry only a date,
    # so pin them to midnight UTC.
    fi.updated(datetime.datetime.combine(date, datetime.datetime.min.time(), tzinfo=datetime.timezone.utc))
    # Collect siblings following this post's paragraph as the entry body.
    # NOTE: starting at parent.next_sibling and advancing *before* reading
    # deliberately skips the sibling immediately after the paragraph
    # (typically a whitespace text node) — preserved from the original.
    node = a.parent.next_sibling
    body = ""
    while True:
        node = node.next_sibling
        if node is None:
            # End of document: the original crashed here (AttributeError on
            # None) if the "Sobre mí" heading was missing.
            break
        if node.name == "p" and node.a and node.a.string and post_re.fullmatch(node.a.string):
            break  # start of the next post
        if node.name == "h1" and node.string == "Sobre mí":
            break  # trailing "about me" section ends the last post
        body += node if isinstance(node, bs4.NavigableString) else node.prettify()
    fi.content(body, type="html")
    posts += 1

print(fg.atom_str(pretty=True).decode("utf8"))
|