From 7f5e2cdc0b81d46fd6255ebae8cf4f6c92c2030d Mon Sep 17 00:00:00 2001 From: alex Date: Sat, 14 Feb 2026 21:23:16 +0100 Subject: Import initial gemini-to-web --- gemini-to-web/src/gemini_to_web/parser.py | 233 ++++++++++++++++++++++++++++++ 1 file changed, 233 insertions(+) create mode 100644 gemini-to-web/src/gemini_to_web/parser.py (limited to 'gemini-to-web/src/gemini_to_web/parser.py') diff --git a/gemini-to-web/src/gemini_to_web/parser.py b/gemini-to-web/src/gemini_to_web/parser.py new file mode 100644 index 0000000..358cada --- /dev/null +++ b/gemini-to-web/src/gemini_to_web/parser.py @@ -0,0 +1,233 @@ +import dataclasses +from collections import abc +import typing + + +type GemElement = PreformattingToggleLine | PreformattedTextLine | LinkLine | ListItem | QuoteLine | HeadingLine | TextLine + + +def _test(gemtext: str): + return list(parse(gemtext.lstrip())) + + +def parse(gemtext: str) -> abc.Generator[GemElement]: + """ + >>> _test(''' + ... This is a test. + ... ''') + [TextLine(text='This is a test.')] + + >>> _test(''' + ... This is a test. + ... + ... => https://www.example.com/ Link + ... ''') + [TextLine(text='This is a test.'), TextLine(text=''), LinkLine(url='https://www.example.com/', link_name='Link')] + + >>> _test(''' + ... This is a test + ... + ... ```foo + ... bar + ... ``` + ... ''') + [TextLine(text='This is a test'), TextLine(text=''), PreformattingToggleLine(alt_text='foo'), PreformattedTextLine(text='bar'), PreformattingToggleLine(alt_text=None)] + + >>> _test(''' + ... # Welcome + ... + ... Something. + ... ''') + [HeadingLine(level=1, heading_text='Welcome'), TextLine(text=''), TextLine(text='Something.')] + + >>> _test(''' + ... This is a test. + ... + ... * Item 1 + ... * Item 2 + ... ''') + [TextLine(text='This is a test.'), TextLine(text=''), ListItem(text='Item 1'), ListItem(text='Item 2')] + + >>> _test(''' + ... This is a test. + ... + ... > Line 1 + ... > Line 2 + ... ''') + [TextLine(text='This is a test.'), TextLine(text=''), QuoteLine(text='Line 1'), QuoteLine(text='Line 2')] + """ + + current_preformatting_toggle_line = None + + for line in gemtext.splitlines(): + preformatting_toggle_line = PreformattingToggleLine.parse(line) + + if current_preformatting_toggle_line: + if preformatting_toggle_line: + assert not preformatting_toggle_line.alt_text, 'closing preformatting toggle line with alt text {preformatting_toggle_line.alt_text}' + current_preformatting_toggle_line = None + yield preformatting_toggle_line + continue + else: + yield PreformattedTextLine(line) + continue + + if preformatting_toggle_line: + current_preformatting_toggle_line = preformatting_toggle_line + yield preformatting_toggle_line + continue + elif current_preformatting_toggle_line and not preformatting_toggle_line: + current_preformatting_toggle_line = None + + link_line = LinkLine.parse(line) + if link_line: + yield link_line + continue + + list_item = ListItem.parse(line) + if list_item: + yield list_item + continue + + quote_line = QuoteLine.parse(line) + if quote_line: + yield quote_line + continue + + heading_line = HeadingLine.parse(line) + if heading_line: + yield heading_line + continue + + yield TextLine(line) + + +@dataclasses.dataclass +class TextLine: + text: str + + +@dataclasses.dataclass +class PreformattedTextLine(TextLine): + pass + +@dataclasses.dataclass +class LinkLine: + url: str + link_name: typing.Optional[str] = None + + @staticmethod + def parse(line: str): + """ + >>> LinkLine.parse('Not a link line') + + >>> LinkLine.parse('=>foo') + LinkLine(url='foo', link_name=None) + + >>> LinkLine.parse('=> foo') + LinkLine(url='foo', link_name=None) + + >>> LinkLine.parse('=> foo bar') + LinkLine(url='foo', link_name='bar') + """ + if not line.startswith('=>'): + return None + line = line.removeprefix('=>') + line = line.lstrip() + parts = line.split(maxsplit=1) + assert len(parts) in (1,2) + if len(parts) == 1: + return LinkLine(parts[0]) + return LinkLine(parts[0], parts[1]) + + +@dataclasses.dataclass +class PreformattingToggleLine: + alt_text: typing.Optional[str] + + @staticmethod + def parse(line: str): + """ + >>> PreformattingToggleLine.parse('Not a preformatting toggle line') + + >>> PreformattingToggleLine.parse('```') + PreformattingToggleLine(alt_text=None) + + >>> PreformattingToggleLine.parse('```alt') + PreformattingToggleLine(alt_text='alt') + """ + if not line.startswith('```'): + return None + line = line.removeprefix('```') + line: typing.Optional[str] = line if line else None + return PreformattingToggleLine(line) + + +@dataclasses.dataclass +class HeadingLine: + level: int + heading_text: str + + @staticmethod + def parse(line: str): + """ + >>> HeadingLine.parse('Not a heading line') + + >>> HeadingLine.parse('# Foo') + HeadingLine(level=1, heading_text='Foo') + """ + parts = line.split(maxsplit=1) + if len(parts) < 2: + return None + if not parts[0] in ('#', '##', '###'): + return None + return HeadingLine(len(parts[0]), parts[1]) + + +@dataclasses.dataclass +class ListItem: + text: str + + @staticmethod + def parse(line: str): + """ + >>> ListItem.parse('Not a list item') + + >>> ListItem.parse('* Foo') + ListItem(text='Foo') + """ + if not line.startswith("* "): + return None + return ListItem(line.removeprefix("* ")) + + +@dataclasses.dataclass +class QuoteLine: + text: str + + @staticmethod + def parse(line: str): + """ + >>> QuoteLine.parse('Not a quote line') + + >>> QuoteLine.parse('> Foo') + QuoteLine(text='Foo') + """ + if not line.startswith("> "): + return None + return QuoteLine(line.removeprefix("> ")) + + +def cli_parse(): + import json, sys + + def dataclass_json(obj): + assert dataclasses.is_dataclass(obj) + d = dataclasses.asdict(obj) + d["type"] = type(obj).__name__ + return d + + input_ = sys.stdin.read() + gemtext = parse(input_) + as_json = json.dumps(list(gemtext), default=dataclass_json) + print(as_json) -- cgit v1.2.3