aboutsummaryrefslogtreecommitdiff
path: root/gemini-to-web/src/gemini_to_web
diff options
context:
space:
mode:
Diffstat (limited to 'gemini-to-web/src/gemini_to_web')
-rw-r--r--gemini-to-web/src/gemini_to_web/__init__.py25
-rw-r--r--gemini-to-web/src/gemini_to_web/html.py106
-rw-r--r--gemini-to-web/src/gemini_to_web/parser.py233
3 files changed, 364 insertions, 0 deletions
diff --git a/gemini-to-web/src/gemini_to_web/__init__.py b/gemini-to-web/src/gemini_to_web/__init__.py
new file mode 100644
index 0000000..0b2b70e
--- /dev/null
+++ b/gemini-to-web/src/gemini_to_web/__init__.py
@@ -0,0 +1,25 @@
+import argparse
+import pathlib
+import shutil
+
+import htmlgenerator
+
+from gemini_to_web import html
+
def converter():
    """Copy a gemtext tree and render every ``.gmi`` file in it to HTML.

    Command-line arguments (parsed with ``argparse``):

    * ``source`` -- directory to copy from.
    * ``target`` -- directory to copy to; the ``.html`` files are written
      here, next to the copied ``.gmi`` files.
    """
    # Imported locally so this function does not rely on names that the
    # module-level import block does not provide.
    from gemini_to_web import parser as gemtext_parser

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("source", type=pathlib.Path)
    arg_parser.add_argument("target", type=pathlib.Path)
    args = arg_parser.parse_args()

    shutil.copytree(args.source, args.target)
    for gmi in args.target.glob("**/*.gmi"):
        # to_html() consumes parsed elements, not raw text, and iterates over
        # them more than once (title extraction, then the body), so the
        # parser's generator has to be materialised into a list first.
        elements = list(gemtext_parser.parse(gmi.read_text(encoding="utf-8")))
        document = html.to_html(elements)
        rendered = htmlgenerator.render(document, {})
        gmi.with_suffix(".html").write_text(
            html.pretty(rendered),
            encoding="utf-8",
        )
diff --git a/gemini-to-web/src/gemini_to_web/html.py b/gemini-to-web/src/gemini_to_web/html.py
new file mode 100644
index 0000000..9594155
--- /dev/null
+++ b/gemini-to-web/src/gemini_to_web/html.py
@@ -0,0 +1,106 @@
+import typing
+
+import htmlgenerator
+from lxml import etree, html
+
+from gemini_to_web import parser
+
+
def first_header_title_extractor(parsed: list[parser.GemElement]):
    """Return the text of the first heading line in *parsed*.

    Returns ``None`` when *parsed* contains no heading line.
    """
    for element in parsed:
        if isinstance(element, parser.HeadingLine):
            return element.heading_text
    return None
+
+
+def to_html(parsed: list[parser.GemElement], title_extractor=first_header_title_extractor):
+ body = []
+ building_element = None
+ building_content = None
+
+ def close(body, building_element, building_content):
+ if building_element and building_content:
+ body.append(building_element(*building_content))
+ return (body, None, None)
+
+ head = []
+
+ if title_extractor:
+ head.append(htmlgenerator.TITLE(title_extractor(parsed)))
+
+ for item in parsed:
+ match item:
+ case parser.HeadingLine(level, heading_text):
+ if building_element:
+ body, building_element, building_content = close(body, building_element, building_content)
+ headers = [htmlgenerator.H1, htmlgenerator.H2, htmlgenerator.H3]
+ body.append(headers[level-1](heading_text))
+ case parser.QuoteLine(text):
+ # https://geminiprotocol.net/docs/gemtext.gmi#blockquotes says:
+ #
+ # > The quoted content is written as a single long line [...]
+ if building_element:
+ body, building_element, building_content = close(body, building_element, building_content)
+ body.append(htmlgenerator.BLOCKQUOTE(text))
+ case parser.PreformattingToggleLine(alt_text):
+ if building_element == htmlgenerator.PRE:
+ assert not alt_text, f"Closing preformatting toggle line with alt text {alt_text}"
+ body, building_element, building_content = close(body, building_element, building_content)
+ else:
+ body, building_element, building_content = close(body, building_element, building_content)
+ building_element = htmlgenerator.PRE
+ building_content = ""
+ case parser.PreformattedTextLine(text):
+ assert building_element == htmlgenerator.PRE
+ building_content += text
+ building_content += "\n"
+ case parser.TextLine(""):
+ if building_element:
+ body, building_element, building_content = close(body, building_element, building_content)
+ case parser.TextLine(text):
+ if building_element == htmlgenerator.P:
+ building_content += [htmlgenerator.BR(), text]
+ continue
+ elif building_element is not None and building_element != htmlgenerator.P:
+ body, building_element, building_content = close(body, building_element, building_content)
+ building_element = htmlgenerator.P
+ building_content = [text]
+ case parser.LinkLine(url, link_name):
+ if building_element == htmlgenerator.P:
+ building_content += [htmlgenerator.BR(), htmlgenerator.A(link_name, href=url)]
+ continue
+ elif building_element is not None and building_element != htmlgenerator.P:
+ body, building_element, building_content = close(body, building_element, building_content)
+ building_element = htmlgenerator.P
+ building_content = [htmlgenerator.A(link_name, href=url)]
+ case parser.ListItem(text):
+ if building_element == htmlgenerator.UL:
+ building_content.append(htmlgenerator.LI(text))
+ continue
+ elif building_element is not None and building_element != htmlgenerator.UL:
+ body, building_element, building_content = close(body, building_element, building_content)
+ building_element = htmlgenerator.UL
+ building_content = [htmlgenerator.LI(text)]
+ case _:
+ assert False, f"unknown element {item}"
+
+ close(body, building_element, building_content)
+ html = htmlgenerator.HTML(
+ htmlgenerator.HEAD(*head),
+ htmlgenerator.BODY(*body),
+ )
+ return html
+
+
def pretty(s):
    """Return the HTML document string *s* re-serialised with pretty-printing.

    The input is parsed with ``lxml.html.fromstring`` and written back out
    with ``etree.tostring``; the resulting bytes are decoded as UTF-8.
    """
    document = html.fromstring(s)
    # Serialise with HTML rules.  The default XML method writes empty elements
    # as self-closing tags (for example ``<title/>``), which is not valid HTML
    # serialisation for non-void elements.
    serialized = etree.tostring(document, pretty_print=True, method="html")
    return serialized.decode("utf8")
+
+
def cli_to_html():
    """Read gemtext from standard input and print it as pretty-printed HTML."""
    import sys

    elements = list(parser.parse(sys.stdin.read()))
    document = to_html(elements)
    markup = htmlgenerator.render(document, {})
    print(pretty(markup))
diff --git a/gemini-to-web/src/gemini_to_web/parser.py b/gemini-to-web/src/gemini_to_web/parser.py
new file mode 100644
index 0000000..358cada
--- /dev/null
+++ b/gemini-to-web/src/gemini_to_web/parser.py
@@ -0,0 +1,233 @@
+import dataclasses
+from collections import abc
+import typing
+
+
+type GemElement = PreformattingToggleLine | PreformattedTextLine | LinkLine | ListItem | QuoteLine | HeadingLine | TextLine
+
+
def _test(gemtext: str):
    """Doctest helper: parse *gemtext* after stripping leading whitespace.

    Returns the parsed elements as a list so doctests can show them directly.
    """
    trimmed = gemtext.lstrip()
    return [*parse(trimmed)]
+
+
def parse(gemtext: str) -> abc.Generator[GemElement]:
    """
    Parse gemtext and yield one element per input line.

    Lines between two preformatting toggle lines are yielded verbatim as
    ``PreformattedTextLine``.  Every other line is tried as a link, list
    item, quote and heading, in that order, and falls back to ``TextLine``.
    Any text following the fence of a closing toggle line is ignored, so a
    closing toggle is always yielded with ``alt_text=None``.

    >>> _test('''
    ... This is a test.
    ... ''')
    [TextLine(text='This is a test.')]

    >>> _test('''
    ... This is a test.
    ...
    ... => https://www.example.com/ Link
    ... ''')
    [TextLine(text='This is a test.'), TextLine(text=''), LinkLine(url='https://www.example.com/', link_name='Link')]

    >>> _test('''
    ... This is a test
    ...
    ... ```foo
    ... bar
    ... ```
    ... ''')
    [TextLine(text='This is a test'), TextLine(text=''), PreformattingToggleLine(alt_text='foo'), PreformattedTextLine(text='bar'), PreformattingToggleLine(alt_text=None)]

    >>> _test('''
    ... ```
    ... bar
    ... ```ignored
    ... ''')
    [PreformattingToggleLine(alt_text=None), PreformattedTextLine(text='bar'), PreformattingToggleLine(alt_text=None)]

    >>> _test('''
    ... # Welcome
    ...
    ... Something.
    ... ''')
    [HeadingLine(level=1, heading_text='Welcome'), TextLine(text=''), TextLine(text='Something.')]

    >>> _test('''
    ... This is a test.
    ...
    ... * Item 1
    ... * Item 2
    ... ''')
    [TextLine(text='This is a test.'), TextLine(text=''), ListItem(text='Item 1'), ListItem(text='Item 2')]

    >>> _test('''
    ... This is a test.
    ...
    ... > Line 1
    ... > Line 2
    ... ''')
    [TextLine(text='This is a test.'), TextLine(text=''), QuoteLine(text='Line 1'), QuoteLine(text='Line 2')]
    """

    in_preformatted_block = False

    for line in gemtext.splitlines():
        toggle = PreformattingToggleLine.parse(line)

        if in_preformatted_block:
            if toggle is not None:
                in_preformatted_block = False
                # Whatever follows the closing fence is dropped rather than
                # rejected, so the closer never carries alt text.
                yield PreformattingToggleLine(None)
            else:
                yield PreformattedTextLine(line)
            continue

        if toggle is not None:
            in_preformatted_block = True
            yield toggle
            continue

        # The first line type that recognises the line wins; the order
        # matches the original chain of checks.
        for line_type in (LinkLine, ListItem, QuoteLine, HeadingLine):
            element = line_type.parse(line)
            if element is not None:
                yield element
                break
        else:
            yield TextLine(line)
+
+
+@dataclasses.dataclass
+class TextLine:
+ text: str
+
+
@dataclasses.dataclass
class PreformattedTextLine(TextLine):
    """A text line that belongs to a preformatted block."""
+
@dataclasses.dataclass
class LinkLine:
    """A ``=>`` link line: a URL plus an optional human-readable name."""

    url: str
    link_name: typing.Optional[str] = None

    @staticmethod
    def parse(line: str):
        """
        Parse *line* as a link line.

        Returns a ``LinkLine``, or ``None`` when *line* does not start with
        ``=>`` or carries no URL after it.

        >>> LinkLine.parse('Not a link line')

        >>> LinkLine.parse('=>')

        >>> LinkLine.parse('=>foo')
        LinkLine(url='foo', link_name=None)

        >>> LinkLine.parse('=> foo')
        LinkLine(url='foo', link_name=None)

        >>> LinkLine.parse('=> foo bar')
        LinkLine(url='foo', link_name='bar')
        """
        if not line.startswith('=>'):
            return None
        # split() without a separator also discards the optional whitespace
        # between "=>" and the URL.
        parts = line.removeprefix('=>').split(maxsplit=1)
        if not parts:
            # A bare "=>" has no URL; report "not a link line" instead of
            # failing so the caller can fall back to another line type.
            return None
        return LinkLine(*parts)
+
+
@dataclasses.dataclass
class PreformattingToggleLine:
    """A line starting with three backticks, with optional alt text."""

    # Text following the backticks, or None when there is none.
    alt_text: typing.Optional[str]

    @staticmethod
    def parse(line: str):
        """
        Parse *line* as a preformatting toggle line, or return ``None``.

        >>> PreformattingToggleLine.parse('Not a preformatting toggle line')

        >>> PreformattingToggleLine.parse('```')
        PreformattingToggleLine(alt_text=None)

        >>> PreformattingToggleLine.parse('```alt')
        PreformattingToggleLine(alt_text='alt')
        """
        fence = '```'
        if line[:len(fence)] != fence:
            return None
        remainder = line[len(fence):]
        # An empty remainder is reported as "no alt text".
        return PreformattingToggleLine(remainder or None)
+
+
@dataclasses.dataclass
class HeadingLine:
    """A heading line: one to three ``#`` characters followed by the text."""

    level: int
    heading_text: str

    @staticmethod
    def parse(line: str):
        """
        Parse *line* as a heading line.

        Returns a ``HeadingLine``, or ``None`` when *line* does not start
        with one to three ``#`` characters or has no heading text.  The
        whitespace between the marker and the text is optional.

        >>> HeadingLine.parse('Not a heading line')

        >>> HeadingLine.parse('# Foo')
        HeadingLine(level=1, heading_text='Foo')

        >>> HeadingLine.parse('##Foo')
        HeadingLine(level=2, heading_text='Foo')

        >>> HeadingLine.parse(' # Foo')
        """
        # The marker has to be at the very start of the line; leading
        # whitespace makes it an ordinary text line.
        if not line.startswith('#'):
            return None
        after_marker = line.lstrip('#')
        level = len(line) - len(after_marker)
        if level > 3:
            return None
        heading_text = after_marker.lstrip()
        if not heading_text:
            return None
        return HeadingLine(level, heading_text)
+
+
@dataclasses.dataclass
class ListItem:
    """A list item line, introduced by an asterisk and a space."""

    # The item's content without the bullet prefix.
    text: str

    @staticmethod
    def parse(line: str):
        """
        Parse *line* as a list item, or return ``None``.

        >>> ListItem.parse('Not a list item')

        >>> ListItem.parse('* Foo')
        ListItem(text='Foo')
        """
        bullet = "* "
        if line.startswith(bullet):
            return ListItem(line[len(bullet):])
        return None
+
+
@dataclasses.dataclass
class QuoteLine:
    """A quote line, introduced by ``>``."""

    # The quoted content without the ``>`` marker.
    text: str

    @staticmethod
    def parse(line: str):
        """
        Parse *line* as a quote line, or return ``None``.

        Any line starting with ``>`` is a quote line; a single space after
        the marker is treated as a separator and dropped.

        >>> QuoteLine.parse('Not a quote line')

        >>> QuoteLine.parse('> Foo')
        QuoteLine(text='Foo')

        >>> QuoteLine.parse('>Foo')
        QuoteLine(text='Foo')
        """
        if not line.startswith(">"):
            return None
        # Drop the marker, then at most one separating space so that any
        # further indentation inside the quote is kept.
        return QuoteLine(line.removeprefix(">").removeprefix(" "))
+
+
def cli_parse():
    """Read gemtext from standard input and print the parsed lines as JSON.

    Each element is serialised as its dataclass fields plus a ``"type"`` key
    holding the element's class name.
    """
    import json
    import sys

    def dataclass_json(obj):
        # json.dumps calls this for every object it cannot serialise itself.
        assert dataclasses.is_dataclass(obj)
        return {**dataclasses.asdict(obj), "type": type(obj).__name__}

    elements = list(parse(sys.stdin.read()))
    print(json.dumps(elements, default=dataclass_json))