markdown.py
Python script, ASCII text executable
"""
This is a parser for a Markdown-like language, but it isn't compatible with
the CommonMark specification; check doc/enduser/Formatting messages.md for
its syntax.

Roundabout - git hosting for everyone <https://roundabout-host.com>
Copyright (C) 2023-2025 Roundabout developers <root@roundabout-host.com>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""


import re
import bs4 as beautifulsoup
import sys


def only_chars(string, chars):
    """Return True when every character of *string* occurs in *chars*.

    An empty *string* yields True, so callers that need a non-empty match
    have to check the length themselves.
    """
    return set(string).issubset(set(chars))


# Inline grammar, compiled by parse_line() with re.VERBOSE (whitespace and
# "# ..." comments inside the pattern are ignored by the regex engine).
#
# Differences from the previous pattern, all deliberate:
#   * "_" emphasis uses the zero-width (?<!\S) instead of (?:^|\s), so the
#     whitespace in front of the marker is no longer swallowed by the match
#     and dropped from the output.
#   * The "_" emphasis body excludes a bare backslash from the catch-all
#     branch; previously "\x" could be consumed in two different ways, which
#     made failed matches backtrack exponentially.
#   * The two-backtick rule really requires two backticks (it used the
#     one-character class [``], making it identical to the single-backtick
#     rule) and allows single backticks inside the span.
#
# NOTE(review): the "_" emphasis body still excludes "*" rather than "_";
# this looks copied from the "*" rule but is left as is - confirm intent.
inline_regex = r"""
    (?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\)  # hyperlink or media
|
    <(?P<urlDestination2>[^<>]*)>                                                    # autolink
|
    (?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em)                               # emphasis with * not requiring space on either side
|
    (?<!\S)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^*\\])*) (?P=em2)(?=\s|$)             # emphasis with _ requiring space on at least one side
|
    `` (?P<textCode2>(?:[^`]|`(?!`))*) ``                                            # inline code (2 backticks)
|
    [`] (?P<textCode>(?:\\[`]|[^`])*) [`]                                            # inline code
|
    (?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2})                          # strikethrough
|
    (?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff)                    # diffs
"""


def leading(string, character):
    """Return how many characters str.lstrip(character) removes from *string*."""
    return len(string) - len(string.lstrip(character))


def trailing(string, character):
    """Return how many characters str.rstrip(character) removes from *string*."""
    return len(string) - len(string.rstrip(character))


class Element:
    """Base node of the parsed document; on its own it is an empty "void" block."""

    def __init__(self):
        self.classes = []  # CSS class names emitted on the generated tag
        self.content = None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        # "m-void" is the marker make_html() uses to skip a node entirely.
        return "m-void"


class Container(Element):
    """Element whose content is inline-parsed text (see parse_line)."""

    def __init__(self, content):
        super().__init__()
        self.content = parse_line(content)

    def __repr__(self):
        return "Generic container element: " + repr(self.content)


class Rule(Element):
    """Horizontal rule."""

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        return "hr"


class HardBreak(Element):
    """Forced line break."""

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        return "br"


class Heading(Container):
    """Heading with inline-parsed text; *level* selects the ``h<level>`` tag."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "h" + str(self.level)


class Paragraph(Container):
    """Paragraph of inline-parsed text."""

    def __init__(self, content):
        # Container already inline-parses the text; no need to parse "" first.
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "p"


class CodeBlock(Element):
    """Fenced code block; *content* is kept as raw text."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.content = content
        self.language = language

    def __repr__(self):
        return f"Code block ({self.language}):\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "pre"


class UnorderedList(Element):
    """Bulleted list; *content* is stored as given (tokenise passes ListItem objects)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Unordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ul"


class OrderedList(Element):
    """Numbered list; *content* is stored as given (tokenise passes ListItem objects)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Ordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ol"


class ListItem(Element):
    """List entry whose text is parsed again as block-level markup."""

    def __init__(self, content):
        super().__init__()
        self.content = tokenise(content)

    def __repr__(self):
        return "List item:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "li"


class Blockquote(Paragraph):
    """Quotation whose text is parsed again as block-level markup."""

    def __init__(self, content):
        # Initialise with empty text: the quote body is block-parsed below,
        # not inline-parsed like an ordinary paragraph.
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return "Blockquote:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "blockquote"


class Emphasis(Container):
    """Emphasised text; *value* is the marker length (1 to 7 per inline_regex)."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        # The three low bits of the marker length each map to one CSS class.
        if value >= 4:
            self.classes.append("emphasis-3")
        if value % 4 >= 2:
            self.classes.append("emphasis-2")
        if value % 2:
            self.classes.append("emphasis-1")

    def __repr__(self):
        return f"Emphasis ({self.value}): " + repr(self.content)

    @property
    def tag_name(self):
        return "em" if self.value == 1 else "strong"


class Code(Element):
    """Inline code span; the text is kept raw (not inline-parsed)."""

    def __init__(self, content):
        super().__init__()
        # Stored as a one-item list so make_html() handles it like any token list.
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        return "code"


class Strikethrough(Container):
    """Struck-through inline text."""

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"

    @property
    def tag_name(self):
        return "s"


class Diff(Container):
    """Inline insertion or deletion; *value* is the marker, "++" or "--"."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        return "ins" if self.value == "++" else "del"


class Link(Element):
    """Hyperlink with inline-parsed text pointing at *destination*."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.content = parse_line(content)
        self.destination = destination
        self.image = image

    def __repr__(self):
        return f"{'Image' if self.image else 'Link'}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        return "a"


class Image(Link):
    """Embedded image; *text* becomes the alternative text."""

    def __init__(self, text, destination):
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        return "img"


def parse_line(source):
    """Split *source* into inline tokens.

    Returns a list, in source order, of plain ``str`` segments and inline
    Element objects built from the matches of ``inline_regex``.  When
    *source* ends in exactly one backslash, the backslash is removed and a
    HardBreak is appended as the last token.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    pattern = re.compile(inline_regex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    lookup = 0  # end of the previous match; text before the next match is literal
    for match in pattern.finditer(source):
        tokens.append(source[lookup:match.start()])
        lookup = match.end()

        # Only one alternative of the pattern takes part in a match, so at
        # most one of these branches applies.  A match whose captured text is
        # empty (e.g. an empty code span) produces no token.
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("em2"):
            tokens.append(Emphasis(match.group("textEm2"), len(match.group("em2"))))
        elif match.group("textCode"):
            tokens.append(Code(match.group("textCode")))
        elif match.group("textCode2"):
            tokens.append(Code(match.group("textCode2")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText"):
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))
        elif match.group("urlDestination2"):
            autolink = match.group("urlDestination2")
            if "://" not in autolink:
                url_text = autolink.partition(":")[2]  # remove tel, mailto, sms prefixes
                url_destination = autolink
                if url_destination.startswith("mailto:"):
                    # NOTE(review): as written both calls replace "@" with an
                    # identical "@", i.e. they do nothing.  The replacement
                    # text may have been lost (an HTML entity or an invisible
                    # character?) - confirm against the upstream file.
                    url_destination = url_destination.replace("@", "@")  # prevent email harvesting
                    url_text = url_text.replace("@", "@")  # prevent protocol injection
            else:
                url_text = url_destination = autolink

            tokens.append(Link(url_text, url_destination))

    tokens.append(source[lookup:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens


# Line prefixes that end a running paragraph (see _continues_paragraph).
_PARAGRAPH_BREAKERS = ("#", ">", ";", "* ", "+ ", "- ", "~~~", "```")


def _is_bullet(line):
    """Return True for an unordered-list marker: "*", "+" or "-" followed by a space."""
    # Slicing instead of indexing keeps one-character lines from raising.
    return line.startswith(("*", "+", "-")) and line[1:2] == " "


def _is_rule(line):
    """Return True for a horizontal rule: 3+ characters drawn only from "-_* "."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _is_setext_underline(line):
    """Return True for a non-empty line made only of "=" or only of "-"."""
    stripped = line.strip()
    return bool(stripped) and (only_chars(stripped, "=") or only_chars(stripped, "-"))


def _continues_paragraph(line):
    """Return True when *line* may be appended to a paragraph already in progress."""
    return (bool(line.strip())
            and not line.startswith(_PARAGRAPH_BREAKERS)
            and not re.match(r"^\d+\.", line))


def _plain_text(tokens):
    """Concatenate the text of a token list, descending into nested elements."""
    parts = []
    for token in tokens:
        if isinstance(token, str):
            parts.append(token)
            continue
        inner = getattr(token, "content", None)
        if isinstance(inner, list):
            parts.append(_plain_text(inner))
        elif isinstance(inner, str):
            parts.append(inner)
    return "".join(parts)


def tokenise(source):
    """Parse block-level markup in *source* into a list of Element objects.

    The list also contains plain ``Element`` instances ("void" blocks) for
    blank lines, for lines starting with ";" and as the initial placeholder;
    make_html() skips them.

    Fixes relative to the earlier implementation:
      * every finished block is appended before the next one starts (a block
        used to be dropped when followed by a block of the same type, e.g.
        two adjacent code fences);
      * the paragraph branch always consumes the line that selected it, so
        lines such as "#tag" or "~~~~" no longer loop forever;
      * an ordered-list item on the last line no longer raises IndexError
        (the unused marker-length computation that caused it is gone);
      * a one-character line after a bullet item no longer raises IndexError;
      * blockquote continuation is decided per line, not from the first line
        of the quote;
      * the setext heading level ignores indentation of the underline.
    """
    tokens = []

    current_block = Element()

    # Drop one leading space per line (only one; deeper indentation is kept).
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]

        # Each branch below starts a new block, so flush the previous one.
        tokens.append(current_block)

        if not line.strip() or line.startswith(";"):
            # Void block
            current_block = Element()
            i += 1
        elif _is_rule(line):
            # Horizontal rule
            current_block = Rule()
            i += 1
        elif _is_bullet(line):
            items = []
            while i < len(lines) and _is_bullet(lines[i]):
                inner_content = lines[i][2:].strip() + "\n"  # discard marker and space
                i += 1
                # Following non-blank lines without a marker belong to the item.
                while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
                    inner_content += lines[i] + "\n"
                    i += 1

                items.append(ListItem(inner_content))

            current_block = UnorderedList(items)
        elif re.match(r"^\d+\.", line):
            items = []
            while i < len(lines) and re.match(r"^ ?\d+\.", lines[i]):
                inner_content = lines[i].split(".", 1)[1] + "\n"  # discard number and period
                i += 1
                # Following non-blank lines without a number belong to the item;
                # their first two characters are dropped.
                while i < len(lines) and lines[i].strip() and not re.match(r"^ ?\d+\.", lines[i]):
                    inner_content += lines[i][2:] + "\n"
                    i += 1

                items.append(ListItem(inner_content))

            current_block = OrderedList(items)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # ATX heading: one or more "#" followed by at least one space
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            content = ""
            # Besides ">" lines, a quote absorbs following non-blank lines
            # unless they start with "#" or form a horizontal rule.
            while i < len(lines) and (
                    lines[i].startswith(">")
                    or (lines[i].strip()
                        and not lines[i].startswith("#")
                        and not _is_rule(lines[i]))):
                content += lines[i].lstrip(">") + "\n"
                i += 1

            current_block = Blockquote(content)
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            language = line.lstrip("`~").strip()

            content = ""
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                content += lines[i] + "\n"
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock(content, language=language)
        elif i < len(lines) - 1 and _is_setext_underline(lines[i + 1]):
            # Setext heading: text line underlined with "=" (h1) or "-" (h2)
            level = 1 if lines[i + 1].strip().startswith("=") else 2
            current_block = Heading(line.strip(), level)
            i += 2
        else:
            # Paragraph, if there is no other specifier.  The current line is
            # always consumed so the outer loop is guaranteed to advance.
            content = line.strip() + "\n"
            i += 1
            while i < len(lines) and _continues_paragraph(lines[i]):
                content += lines[i].strip() + "\n"
                i += 1

            current_block = Paragraph(content)

    tokens.append(current_block)

    return tokens


def make_html(ast):
    """Render a token list (from tokenise or parse_line) with Beautiful Soup.

    Strings are appended as text nodes; elements become tags named after
    their ``tag_name``, except void elements ("m-void"), which are skipped.
    Returns the BeautifulSoup object holding the generated nodes.
    """
    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag = soup.new_tag(str(node.tag_name))
        # NOTE(review): destinations are copied into href/src without any
        # scheme check in this function; if the markup is untrusted, confirm
        # that "javascript:"-style URLs are filtered somewhere else.
        if node.tag_name == "a":
            tag["href"] = node.destination
        if node.tag_name == "img":
            tag["src"] = node.destination
            # The alt text may contain nested inline elements, which a plain
            # str.join() over the tokens cannot handle.
            tag["alt"] = _plain_text(node.content)
        if node.tag_name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)
        try:
            if isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content and node.tag_name != "img":
                tag.string = node.content

            if node.tag_name == "img":
                tag.string = ""  # images carry their text in alt, not as children
        except AttributeError as exc:
            # Best effort: report the problem and still emit the tag.
            print(exc, file=sys.stderr)
        soup.append(tag)
    return soup


def markdown2html(markdown):
    """Convert *markdown* source text to the BeautifulSoup tree built by make_html()."""
    return make_html(tokenise(markdown))