blob: 3803be7f3b3544e8f8f095f1ecdb73c31c2300cb (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
import datetime
import re
import sys
import urllib.request
import bs4
from feedgen import feed
# Scrape a blog index page and print an Atom feed of its most recent posts.
#
# Usage: script.py URL FEED_SIZE TITLE FEED_ID AUTHOR_NAME AUTHOR_EMAIL
#
# The page is expected to contain anchors whose text is "YYYY-MM-DD Title";
# everything between one such anchor's paragraph and the next post (or the
# final "Sobre mí" heading) becomes that entry's HTML content.
url = sys.argv[1]
feed_size = int(sys.argv[2])
feed_title = sys.argv[3]
feed_id = sys.argv[4]  # renamed from `id`, which shadowed the builtin
author_name = sys.argv[5]
author_email = sys.argv[6]

with urllib.request.urlopen(url) as response:
    page_html = response.read().decode("utf8")
soup = bs4.BeautifulSoup(page_html, features="lxml")

# "YYYY-MM-DD Title" — compiled once instead of on every anchor/paragraph.
post_re = re.compile("(....-..-..) (.*)")

fg = feed.FeedGenerator()
fg.title(feed_title)
fg.id(feed_id + "/")

posts = 0
for a in soup.find_all("a"):
    if posts == feed_size:
        break
    # `a.string` is None for empty anchors or anchors with nested markup;
    # the original passed None to re.fullmatch and raised TypeError.
    match = post_re.fullmatch(a.string) if a.string else None
    if not match:
        continue
    post_title = match.group(2)
    date = datetime.datetime.strptime(match.group(1), "%Y-%m-%d").date()
    fi = fg.add_item()
    fi.title(post_title)
    fi.id(url + a["href"])
    fi.link(href=a["href"])
    fi.author(name=author_name, email=author_email)
    # Atom requires a timezone-aware timestamp; posts carry only a date,
    # so pin them to midnight UTC.
    fi.updated(datetime.datetime.combine(date, datetime.datetime.min.time(), tzinfo=datetime.timezone.utc))
    # Collect siblings following this post's paragraph as the entry body.
    # NOTE: starting at parent.next_sibling and advancing *before* reading
    # deliberately skips the sibling immediately after the paragraph
    # (typically a whitespace text node) — preserved from the original.
    node = a.parent.next_sibling
    body = ""
    while True:
        node = node.next_sibling
        if node is None:
            # End of document: the original crashed here (AttributeError on
            # None) if the "Sobre mí" heading was missing.
            break
        if node.name == "p" and node.a and node.a.string and post_re.fullmatch(node.a.string):
            break  # start of the next post
        if node.name == "h1" and node.string == "Sobre mí":
            break  # trailing "about me" section ends the last post
        body += node if isinstance(node, bs4.NavigableString) else node.prettify()
    fi.content(body, type="html")
    posts += 1

print(fg.atom_str(pretty=True).decode("utf8"))
|