markdown.py
Python script, ASCII text executable
"""A small Markdown dialect: tokenise text into an element tree and render it as HTML."""

import re
import bs4 as beautifulsoup
import sys


def only_chars(string, chars):
    """Return True if every character of ``string`` is one of ``chars``.

    An empty ``string`` yields True, because the empty set is a subset of
    any set.
    """
    chars = set(chars)
    all_chars = set(string)
    return all_chars.issubset(chars)


# Compiled with re.VERBOSE, so literal whitespace and ``#`` comments inside the
# pattern are ignored by the regex engine.
inlineRegex = r"""
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\)      # hyperlink or media
|
(?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em)                                  # emphasis with * not requiring space on either side
|
(?<!\S)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^*])*?) (?P=em2)(?=\s|$)                 # emphasis with _ delimited by whitespace or the text boundary
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`]                                               # inline code
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2})                             # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff)                       # diffs
"""

# Compiled once at import time instead of on every parse_line() call.
_INLINE_PATTERN = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

# A line that opens an ordered-list item: digits followed by a period.
_ORDERED_MARKER = re.compile(r"^\d+\.")

# Markers that open an unordered-list item (bullet character plus one space).
_BULLET_MARKERS = ("* ", "+ ", "- ")

# Line prefixes that end a running paragraph.
_PARAGRAPH_BREAKERS = ("#", ">", "* ", "+ ", "- ", "~~~", "```")

# Text rendered when the script is run without a file argument.
_SAMPLE = """
_test_

and this is another _test_ by the way
"""


def leading(string, character):
    """Return how many characters from ``character`` start ``string``."""
    return len(string) - len(string.lstrip(character))


def trailing(string, character):
    """Return how many characters from ``character`` end ``string``."""
    return len(string) - len(string.rstrip(character))


class Element:
    """Base node of the element tree; on its own it is a void block that renders nothing."""

    def __init__(self):
        self.classes = []    # CSS class names emitted on the HTML tag
        self.content = None  # str, list of child nodes, or None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        # "m-void" is the marker make_html() uses to skip an element.
        return "m-void"


class Container(Element):
    """Element whose content is a string parsed for inline markup."""

    def __init__(self, content):
        super().__init__()
        self.content = parse_line(content)

    def __repr__(self):
        return "Generic container element: " + repr(self.content)


class Rule(Element):
    """Horizontal rule."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        return "hr"


class HardBreak(Element):
    """Forced line break."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        return "br"


class Heading(Container):
    """Heading whose ``level`` selects the ``h<level>`` tag."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "h" + str(self.level)


class Paragraph(Container):
    """Paragraph of inline content."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "p"


class CodeBlock(Element):
    """Fenced code block; ``content`` is kept verbatim as a string."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.content = content
        self.language = language

    def __repr__(self):
        return f"Code block ({self.language}):\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "pre"


class UnorderedList(Element):
    """Bulleted list; ``content`` is stored as given (tokenise() passes ListItem objects)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Unordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ul"


class OrderedList(Element):
    """Numbered list; ``content`` is stored as given (tokenise() passes ListItem objects)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Ordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ol"


class ListItem(Element):
    """List item whose source text is tokenised recursively into blocks."""

    def __init__(self, content):
        super().__init__()
        self.content = tokenise(content)

    def __repr__(self):
        return "List item:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "li"


class Blockquote(Paragraph):
    """Block quote whose source text is tokenised recursively into blocks."""

    def __init__(self, content):
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return "Blockquote:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "blockquote"


class Emphasis(Container):
    """Emphasised inline text.

    ``value`` is the number of marker characters (1-7). Its three low bits
    select the ``emphasis-1`` / ``emphasis-2`` / ``emphasis-3`` classes.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        if value >= 4:
            self.classes.append("emphasis-3")
        if value % 4 >= 2:
            self.classes.append("emphasis-2")
        if value % 2:
            self.classes.append("emphasis-1")

    def __repr__(self):
        return f"Emphasis ({self.value}): " + repr(self.content)

    @property
    def tag_name(self):
        return "em" if self.value == 1 else "strong"


class Code(Element):
    """Inline code; the text is wrapped in a list and is not parsed further."""

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        return "code"


class Strikethrough(Container):
    """Struck-through inline text."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"

    @property
    def tag_name(self):
        return "s"


class Diff(Container):
    """Inserted (``++``) or deleted (``--``) inline text; ``value`` is the marker."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        return "ins" if self.value == "++" else "del"


class Link(Element):
    """Hyperlink; ``content`` is the raw link text and ``destination`` the target."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.content = content
        self.destination = destination
        self.image = image

    def __repr__(self):
        return f"{'Image' if self.image else 'Link'}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        return "a"


class Image(Link):
    """Image; ``content`` is the bracketed text and ``destination`` the source."""

    def __init__(self, text, destination):
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        return "img"


def parse_line(source):
    """Split inline markup in ``source`` into a list of tokens.

    The result alternates plain strings (possibly empty) with inline elements
    (Emphasis, Code, Strikethrough, Diff, Link, Image). When ``source`` ends
    with exactly one backslash, the backslash is removed and a HardBreak is
    appended as the last token.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    lookup = 0  # index just past the previous match
    for match in _INLINE_PATTERN.finditer(source):
        tokens.append(source[lookup:match.start()])
        lookup = match.end()

        # Exactly one alternative of the pattern participates in a match.
        # Groups that may legitimately be empty are compared against None so
        # that empty code spans and links/images with empty text are kept
        # instead of silently vanishing from the output.
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("em2"):
            tokens.append(Emphasis(match.group("textEm2"), len(match.group("em2"))))
        elif match.group("textCode") is not None:
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlDestination") is not None:
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))

    tokens.append(source[lookup:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens


def _is_rule(line):
    """Return True if ``line`` is a horizontal rule: 3+ characters drawn from ``-_* ``."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _is_bullet(line):
    """Return True if ``line`` opens an unordered-list item."""
    return line.startswith(_BULLET_MARKERS)


def _is_ordered(line):
    """Return True if ``line`` opens an ordered-list item."""
    return _ORDERED_MARKER.match(line) is not None


def _continues_quote(line):
    """Return True if ``line`` belongs to a running block quote.

    That is either a line starting with ``>`` or a non-blank continuation
    line that is neither a ``#`` line nor a horizontal rule.
    """
    if line.startswith(">"):
        return True
    return bool(line.strip()) and not line.startswith("#") and not _is_rule(line)


def _ends_paragraph(line):
    """Return True if ``line`` terminates a running paragraph."""
    return (
        not line.strip()
        or line.startswith(_PARAGRAPH_BREAKERS)
        or _is_ordered(line)
    )


def tokenise(source):
    """Split ``source`` into a list of block-level elements.

    One leading space is removed from every line before classification. The
    returned list starts with a void Element and contains void Elements for
    blank lines; make_html() skips them.
    """
    tokens = []

    current_block = Element()

    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]

        # Every branch starts a new block, so the previous one is always
        # flushed first. Skipping the flush when both blocks have the same
        # type would drop the earlier block (e.g. two adjacent code fences).
        tokens.append(current_block)

        if not line.strip():
            # Void block
            current_block = Element()
            i += 1
        elif _is_rule(line):
            # Horizontal rule
            current_block = Rule()
            i += 1
        elif _is_bullet(line):
            items = []
            while i < len(lines) and _is_bullet(lines[i]):
                inner_content = lines[i][2:].strip() + "\n"  # discard marker and space
                i += 1
                # Non-blank lines up to the next marker continue the item;
                # their first character is dropped.
                while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
                    inner_content += lines[i][1:] + "\n"
                    i += 1
                items.append(ListItem(inner_content))
            current_block = UnorderedList(items)
        elif _is_ordered(line):
            items = []
            # Test lines[i], not the line that opened the list, so the list
            # ends at the first line that is not a numbered item.
            while i < len(lines) and _is_ordered(lines[i]):
                inner_content = lines[i].split(".", 1)[1] + "\n"  # discard number and period
                i += 1
                while i < len(lines) and lines[i].strip() and not _is_ordered(lines[i]):
                    inner_content += lines[i] + "\n"
                    i += 1
                items.append(ListItem(inner_content))
            current_block = OrderedList(items)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # ATX heading: one or more "#" followed by at least one space
            content = line.lstrip("#").strip()
            current_block = Heading(content, leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            content = ""
            while i < len(lines) and _continues_quote(lines[i]):
                content += lines[i].lstrip(">") + "\n"
                i += 1
            current_block = Blockquote(content)
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            language = line.lstrip("`~").strip()

            content = ""
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                content += lines[i] + "\n"
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock(content, language=language)
        elif (
            i < len(lines) - 1
            and (only_chars(lines[i + 1].strip(), "=") or only_chars(lines[i + 1].strip(), "-"))
            and lines[i + 1].strip()
        ):
            # Setext heading: text underlined with "=" (level 1) or "-" (level 2)
            content = line.strip()
            current_block = Heading(content, 1 if lines[i + 1].startswith("=") else 2)
            i += 2
        else:
            # Paragraph, if there is no other specifier. The current line was
            # rejected by every other branch, so it is consumed
            # unconditionally; otherwise a line such as "#tag" would never
            # advance ``i`` and the loop would not terminate.
            content = lines[i].strip() + "\n"
            i += 1
            while i < len(lines) and not _ends_paragraph(lines[i]):
                content += lines[i].strip() + "\n"
                i += 1
            current_block = Paragraph(content)

    tokens.append(current_block)

    return tokens


def make_html(ast):
    """Render a list of tokens (strings and elements) into a BeautifulSoup object.

    Strings are appended as text. Elements whose ``tag_name`` is ``m-void``
    are skipped. List content is rendered recursively; string content becomes
    the tag's text, except for images where it becomes the ``alt`` attribute.
    """
    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
        elif hasattr(node, "content") and node.tag_name != "m-void":
            tag = soup.new_tag(str(node.tag_name))
            if node.tag_name == "a":
                tag["href"] = node.destination
            if node.tag_name == "img":
                tag["src"] = node.destination
                # <img> is a void element and cannot hold children, so the
                # bracketed text is emitted as alternative text.
                tag["alt"] = node.content
            if node.tag_name == "pre":
                tag["data-language"] = node.language
            if node.classes:
                tag["class"] = " ".join(node.classes)
            try:
                if isinstance(node.content, list):
                    tag.append(make_html(node.content))
                elif node.content and node.tag_name != "img":
                    tag.string = node.content
            except AttributeError as exc:
                print(exc, file=sys.stderr)
            soup.append(tag)
    return soup


def main(argv=None):
    """Convert a Markdown file to HTML and print it.

    ``argv`` defaults to ``sys.argv[1:]``. Its first item, if present, is the
    path of the file to convert; without it a built-in sample is rendered.
    """
    args = sys.argv[1:] if argv is None else argv
    if args:
        with open(args[0], encoding="utf-8") as file:
            source = file.read()
    else:
        source = _SAMPLE

    # Generate an AST from the markdown text
    ast = tokenise(source)

    # Now convert the AST to HTML
    print(make_html(ast).prettify(formatter=beautifulsoup.formatter.HTMLFormatter(indent=4)))


if __name__ == '__main__':
    main()