From 7f5e2cdc0b81d46fd6255ebae8cf4f6c92c2030d Mon Sep 17 00:00:00 2001
From: alex
Date: Sat, 14 Feb 2026 21:23:16 +0100
Subject: Import initial gemini-to-web

---
 gemini-to-web/src/gemini_to_web/__init__.py |  25 +++
 gemini-to-web/src/gemini_to_web/html.py     | 106 +++++++++++++
 gemini-to-web/src/gemini_to_web/parser.py   | 233 ++++++++++++++++++++++++++++
 3 files changed, 364 insertions(+)
 create mode 100644 gemini-to-web/src/gemini_to_web/__init__.py
 create mode 100644 gemini-to-web/src/gemini_to_web/html.py
 create mode 100644 gemini-to-web/src/gemini_to_web/parser.py

(limited to 'gemini-to-web/src/gemini_to_web')

diff --git a/gemini-to-web/src/gemini_to_web/__init__.py b/gemini-to-web/src/gemini_to_web/__init__.py
new file mode 100644
index 0000000..0b2b70e
--- /dev/null
+++ b/gemini-to-web/src/gemini_to_web/__init__.py
@@ -0,0 +1,25 @@
+import argparse
+import pathlib
+import shutil
+
+import htmlgenerator
+
+from gemini_to_web import html, parser
+
+def converter():  # command-line converter: copy `source` to `target`, then render every .gmi as .html
+    arg_parser = argparse.ArgumentParser()  # not named `parser`: that is the gemtext parser module
+    arg_parser.add_argument("source", type=pathlib.Path)
+    arg_parser.add_argument("target", type=pathlib.Path)
+    args = arg_parser.parse_args()
+
+    shutil.copytree(args.source, args.target)  # raises FileExistsError if target already exists
+    for gmi in args.target.glob("**/*.gmi"):
+        html_path = gmi.with_suffix(".html")
+        html_path.write_text(
+            html.pretty(
+                htmlgenerator.render(
+                    html.to_html(list(parser.parse(gmi.read_text()))),  # to_html needs parsed elements, not raw text
+                    {}
+                )
+            )
+        )
diff --git a/gemini-to-web/src/gemini_to_web/html.py b/gemini-to-web/src/gemini_to_web/html.py
new file mode 100644
index 0000000..9594155
--- /dev/null
+++ b/gemini-to-web/src/gemini_to_web/html.py
@@ -0,0 +1,106 @@
+import typing
+
+import htmlgenerator
+from lxml import etree, html
+
+from gemini_to_web import parser
+
+
+def first_header_title_extractor(parsed: list[parser.GemElement]):  # default title: text of the first heading
+    heading_lines = [element for element in parsed if isinstance(element, parser.HeadingLine)]
+    if heading_lines:  # falls through and returns None when the document has no heading
+        return heading_lines[0].heading_text
+
+
+def to_html(parsed: list[parser.GemElement], title_extractor=first_header_title_extractor):
+    body = []  # finished block-level elements, in document order
+    building_element = None  # element class being accumulated (P, UL or PRE), or None
+    building_content = None  # children collected so far for building_element
+
+    def close(body, building_element, building_content):  # flush the pending element (if non-empty) and reset state
+        if building_element and building_content:
+            body.append(building_element(*building_content))
+        return (body, None, None)
+
+    head = []
+
+    if title_extractor:  # pass title_extractor=None to omit <title>
+        head.append(htmlgenerator.TITLE(title_extractor(parsed)))
+
+    for item in parsed:
+        match item:
+            case parser.HeadingLine(level, heading_text):
+                if building_element:
+                    body, building_element, building_content = close(body, building_element, building_content)
+                headers = [htmlgenerator.H1, htmlgenerator.H2, htmlgenerator.H3]
+                body.append(headers[level-1](heading_text))
+            case parser.QuoteLine(text):
+                # https://geminiprotocol.net/docs/gemtext.gmi#blockquotes says:
+                #
+                # > The quoted content is written as a single long line [...]
+                if building_element:
+                    body, building_element, building_content = close(body, building_element, building_content)
+                body.append(htmlgenerator.BLOCKQUOTE(text))
+            case parser.PreformattingToggleLine(alt_text):
+                if building_element == htmlgenerator.PRE:
+                    assert not alt_text, f"Closing preformatting toggle line with alt text {alt_text}"
+                    body, building_element, building_content = close(body, building_element, building_content)
+                else:
+                    body, building_element, building_content = close(body, building_element, building_content)
+                    building_element = htmlgenerator.PRE
+                    building_content = ""
+            case parser.PreformattedTextLine(text):  # must precede TextLine: it is a TextLine subclass
+                assert building_element == htmlgenerator.PRE
+                building_content += text
+                building_content += "\n"
+            case parser.TextLine(""):  # a blank line ends the current paragraph or list
+                if building_element:
+                    body, building_element, building_content = close(body, building_element, building_content)
+            case parser.TextLine(text):
+                if building_element == htmlgenerator.P:
+                    building_content += [htmlgenerator.BR(), text]
+                    continue
+                elif building_element is not None and building_element != htmlgenerator.P:
+                    body, building_element, building_content = close(body, building_element, building_content)
+                building_element = htmlgenerator.P
+                building_content = [text]
+            case parser.LinkLine(url, link_name):  # links without a label show their URL as the link text
+                if building_element == htmlgenerator.P:
+                    building_content += [htmlgenerator.BR(), htmlgenerator.A(link_name or url, href=url)]
+                    continue
+                elif building_element is not None and building_element != htmlgenerator.P:
+                    body, building_element, building_content = close(body, building_element, building_content)
+                building_element = htmlgenerator.P
+                building_content = [htmlgenerator.A(link_name or url, href=url)]
+            case parser.ListItem(text):
+                if building_element == htmlgenerator.UL:
+                    building_content.append(htmlgenerator.LI(text))
+                    continue
+                elif building_element is not None and building_element != htmlgenerator.UL:
+                    body, building_element, building_content = close(body, building_element, building_content)
+                building_element = htmlgenerator.UL
+                building_content = [htmlgenerator.LI(text)]
+            case _:
+                assert False, f"unknown element {item}"
+
+    close(body, building_element, building_content)  # flush what is still open; body is mutated in place
+    html = htmlgenerator.HTML(
+        htmlgenerator.HEAD(*head),
+        htmlgenerator.BODY(*body),
+    )
+    return html
+
+
+def pretty(s):  # re-serialize an HTML string through lxml with pretty_print=True
+    return etree.tostring(html.fromstring(s), pretty_print=True).decode("utf8")
+
+
+def cli_to_html():  # filter: gemtext on stdin, pretty-printed HTML on stdout
+    import sys
+    input_ = sys.stdin.read()
+    gemtext = parser.parse(input_)
+    gemtext = list(gemtext)
+    html = to_html(gemtext)
+    rendered = htmlgenerator.render(html, {})
+    rendered = pretty(rendered)
+    print(rendered)
diff --git a/gemini-to-web/src/gemini_to_web/parser.py b/gemini-to-web/src/gemini_to_web/parser.py
new file mode 100644
index 0000000..358cada
--- /dev/null
+++ b/gemini-to-web/src/gemini_to_web/parser.py
@@ -0,0 +1,233 @@
+import dataclasses
+from collections import abc
+import typing
+
+
+type GemElement = PreformattingToggleLine | PreformattedTextLine | LinkLine | ListItem | QuoteLine | HeadingLine | TextLine
+
+
+def _test(gemtext: str):  # doctest helper: drop the leading newline of the triple-quoted sample, then parse eagerly
+    return list(parse(gemtext.lstrip()))
+
+
+def parse(gemtext: str) -> abc.Generator[GemElement]:
+    """Yield one element per line of gemtext, classified by its line prefix.
+    >>> _test('''
+    ... This is a test.
+    ... ''')
+    [TextLine(text='This is a test.')]
+
+    >>> _test('''
+    ... This is a test.
+    ...
+    ... => https://www.example.com/ Link
+    ... ''')
+    [TextLine(text='This is a test.'), TextLine(text=''), LinkLine(url='https://www.example.com/', link_name='Link')]
+
+    >>> _test('''
+    ... This is a test
+    ...
+    ... ```foo
+    ... bar
+    ... ```
+    ... ''')
+    [TextLine(text='This is a test'), TextLine(text=''), PreformattingToggleLine(alt_text='foo'), PreformattedTextLine(text='bar'), PreformattingToggleLine(alt_text=None)]
+
+    >>> _test('''
+    ... # Welcome
+    ...
+    ... Something.
+    ... ''')
+    [HeadingLine(level=1, heading_text='Welcome'), TextLine(text=''), TextLine(text='Something.')]
+
+    >>> _test('''
+    ... This is a test.
+    ...
+    ... * Item 1
+    ... * Item 2
+    ... ''')
+    [TextLine(text='This is a test.'), TextLine(text=''), ListItem(text='Item 1'), ListItem(text='Item 2')]
+
+    >>> _test('''
+    ... This is a test.
+    ...
+    ... > Line 1
+    ... > Line 2
+    ... ''')
+    [TextLine(text='This is a test.'), TextLine(text=''), QuoteLine(text='Line 1'), QuoteLine(text='Line 2')]
+    """
+
+    current_preformatting_toggle_line = None  # the opening toggle while inside a preformatted block
+
+    for line in gemtext.splitlines():
+        preformatting_toggle_line = PreformattingToggleLine.parse(line)
+
+        if current_preformatting_toggle_line:
+            if preformatting_toggle_line:
+                assert not preformatting_toggle_line.alt_text, f'closing preformatting toggle line with alt text {preformatting_toggle_line.alt_text}'
+                current_preformatting_toggle_line = None
+                yield preformatting_toggle_line
+                continue
+            else:
+                yield PreformattedTextLine(line)  # inside a block every line is kept verbatim
+                continue
+
+        if preformatting_toggle_line:
+            current_preformatting_toggle_line = preformatting_toggle_line
+            yield preformatting_toggle_line
+            continue
+        # No other preformatting case is needed here: the branch above always
+        # continues, so no block is open when execution reaches this point.
+
+        link_line = LinkLine.parse(line)
+        if link_line:
+            yield link_line
+            continue
+
+        list_item = ListItem.parse(line)
+        if list_item:
+            yield list_item
+            continue
+
+        quote_line = QuoteLine.parse(line)
+        if quote_line:
+            yield quote_line
+            continue
+
+        heading_line = HeadingLine.parse(line)
+        if heading_line:
+            yield heading_line
+            continue
+
+        yield TextLine(line)  # fallback: anything unrecognized is plain text
+
+
+@dataclasses.dataclass
+class TextLine:  # plain text line; text='' represents a blank line
+    text: str
+
+
+@dataclasses.dataclass
+class PreformattedTextLine(TextLine):  # verbatim line between two preformatting toggles
+    pass
+
+@dataclasses.dataclass
+class LinkLine:
+    url: str
+    link_name: typing.Optional[str] = None  # None when the link line carries no label
+
+    @staticmethod
+    def parse(line: str):
+        """Return a LinkLine for a '=>' line that carries a URL, else None.
+        >>> LinkLine.parse('Not a link line')
+
+        >>> LinkLine.parse('=>foo')
+        LinkLine(url='foo', link_name=None)
+
+        >>> LinkLine.parse('=> foo')
+        LinkLine(url='foo', link_name=None)
+
+        >>> LinkLine.parse('=> foo bar')
+        LinkLine(url='foo', link_name='bar')
+        """
+        if not line.startswith('=>'):
+            return None
+        line = line.removeprefix('=>')
+        line = line.lstrip()
+        parts = line.split(maxsplit=1)  # [url] or [url, label]; the label keeps its inner spaces
+        if not parts:
+            # A bare "=>" has no URL: report "not a link" so it degrades to text.
+            return None
+        return LinkLine(*parts)
+
+
+@dataclasses.dataclass
+class PreformattingToggleLine:
+    alt_text: typing.Optional[str]  # text after the backticks, None when absent
+
+    @staticmethod
+    def parse(line: str):
+        """Return a toggle for a line starting with three backticks, else None.
+        >>> PreformattingToggleLine.parse('Not a preformatting toggle line')
+
+        >>> PreformattingToggleLine.parse('```')
+        PreformattingToggleLine(alt_text=None)
+
+        >>> PreformattingToggleLine.parse('```alt')
+        PreformattingToggleLine(alt_text='alt')
+        """
+        if not line.startswith('```'):
+            return None
+        line = line.removeprefix('```')
+        line: typing.Optional[str] = line if line else None  # normalize empty alt text to None
+        return PreformattingToggleLine(line)
+
+
+@dataclasses.dataclass
+class HeadingLine:
+    level: int  # 1, 2 or 3: the number of '#' characters
+    heading_text: str
+
+    @staticmethod
+    def parse(line: str):
+        """Return a HeadingLine for '#', '##' or '###' followed by text, else None.
+        >>> HeadingLine.parse('Not a heading line')
+
+        >>> HeadingLine.parse('# Foo')
+        HeadingLine(level=1, heading_text='Foo')
+        """
+        parts = line.split(maxsplit=1)
+        if len(parts) < 2:
+            return None
+        if parts[0] not in ('#', '##', '###'):
+            return None
+        return HeadingLine(len(parts[0]), parts[1])
+
+
+@dataclasses.dataclass
+class ListItem:
+    text: str  # item text without the leading "* "
+
+    @staticmethod
+    def parse(line: str):
+        """Return a ListItem for a line starting with '* ', else None.
+        >>> ListItem.parse('Not a list item')
+
+        >>> ListItem.parse('* Foo')
+        ListItem(text='Foo')
+        """
+        if not line.startswith("* "):
+            return None
+        return ListItem(line.removeprefix("* "))
+
+
+@dataclasses.dataclass
+class QuoteLine:
+    text: str  # quoted text without the leading "> "
+
+    @staticmethod
+    def parse(line: str):
+        """Return a QuoteLine for a line starting with '> ', else None.
+        >>> QuoteLine.parse('Not a quote line')
+
+        >>> QuoteLine.parse('> Foo')
+        QuoteLine(text='Foo')
+        """
+        if not line.startswith("> "):
+            return None
+        return QuoteLine(line.removeprefix("> "))
+
+
+def cli_parse():  # filter: gemtext on stdin, JSON list of type-tagged elements on stdout
+    import json, sys
+
+    def dataclass_json(obj):  # json.dumps fallback: dataclass fields plus a "type" tag
+        assert dataclasses.is_dataclass(obj)
+        d = dataclasses.asdict(obj)
+        d["type"] = type(obj).__name__
+        return d
+
+    input_ = sys.stdin.read()
+    gemtext = parse(input_)
+    as_json = json.dumps(list(gemtext), default=dataclass_json)
+    print(as_json)
-- 
cgit v1.2.3