# markdown.py
# Python script, ASCII text executable
"""Small Markdown parser that builds a token tree and renders it to HTML.

``tokenise`` splits a document into block elements, ``parse_line`` splits
inline text into plain strings and inline elements, and ``make_html`` turns
the resulting tree into a Beautiful Soup document.
"""

import re
import sys

try:
    import bs4 as beautifulsoup
except ImportError:
    # Only make_html() needs Beautiful Soup; the parser itself is pure stdlib,
    # so keep the module importable and fail later with a clear message.
    beautifulsoup = None


def only_chars(string, chars):
    """Return True if every character of *string* occurs in *chars*.

    An empty *string* trivially satisfies this and yields True.
    """
    return set(string) <= set(chars)


# Inline grammar, compiled in VERBOSE mode (whitespace and comments ignored).
inlineRegex = r"""
(?P<em>[*_]{1,7}) (?P<textEm>(?:\\[*]|[^*])*) (?P=em)  # emphasis
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`]  # inline code
|
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\)  # hyperlink or media
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2})  # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff)  # diffs
"""

# Compiled once at import time instead of on every parse_line() call.
_INLINE_PATTERN = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

_ORDERED_MARKER = re.compile(r"\d+\.")
_BULLET_PREFIXES = ("* ", "+ ", "- ")
_PARAGRAPH_BREAKERS = ("#", ">", "~~~", "```") + _BULLET_PREFIXES


def leading(string, character):
    """Return how many characters ``str.lstrip(character)`` removes from *string*."""
    return len(string) - len(string.lstrip(character))


def trailing(string, character):
    """Return how many characters ``str.rstrip(character)`` removes from *string*."""
    return len(string) - len(string.rstrip(character))


class Element:
    """Base node of the tree; a bare instance is the empty "void" block."""

    def __init__(self):
        self.classes = []  # CSS class names emitted on the HTML tag
        self.content = None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        return "m-void"


class Container(Element):
    """Element whose content is inline-parsed text."""

    def __init__(self, content):
        super().__init__()
        self.content = parse_line(content)

    def __repr__(self):
        return "Generic container element: " + repr(self.content)


class Rule(Element):
    """Horizontal rule."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        return "hr"


class HardBreak(Element):
    """Forced line break."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        return "br"


class Heading(Container):
    """Heading of a given *level*; the level is appended to ``h`` for the tag."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "h" + str(self.level)


class Paragraph(Container):
    """Paragraph of inline-parsed text."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "p"


class CodeBlock(Element):
    """Fenced code block; *content* is kept as a raw string."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.content = content
        self.language = language

    def __repr__(self):
        return f"Code block ({self.language}):\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "pre"


class UnorderedList(Element):
    """Bullet list; *content* is stored as given (tokenise passes ListItems)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Unordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ul"


class OrderedList(Element):
    """Numbered list; *content* is stored as given (tokenise passes ListItems)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Ordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ol"


class ListItem(Element):
    """List entry; its text is block-parsed recursively with tokenise()."""

    def __init__(self, content):
        super().__init__()
        self.content = tokenise(content)

    def __repr__(self):
        return "List item:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "li"


class Blockquote(Paragraph):
    """Quoted section; its text is block-parsed recursively with tokenise()."""

    def __init__(self, content):
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return "Blockquote:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "blockquote"


class Emphasis(Container):
    """Emphasised text; *value* is the length of the ``*``/``_`` marker (1-7).

    The three low bits of *value* select the ``emphasis-1``, ``emphasis-2``
    and ``emphasis-3`` CSS classes.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        if value >= 4:
            self.classes.append("emphasis-3")
        if value % 4 >= 2:
            self.classes.append("emphasis-2")
        if value % 2:
            self.classes.append("emphasis-1")

    def __repr__(self):
        return f"Emphasis ({self.value}): " + repr(self.content)

    @property
    def tag_name(self):
        return "em" if self.value == 1 else "strong"


class Code(Element):
    """Inline code span; the raw text is wrapped in a one-element list."""

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        return "code"


class Strikethrough(Container):
    """Text wrapped in ``~~``."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"

    @property
    def tag_name(self):
        return "s"


class Diff(Container):
    """Inserted (``++``) or deleted (``--``) text; *value* is the marker."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        return "ins" if self.value == "++" else "del"


class Link(Element):
    """Hyperlink; *content* is the raw link text, *destination* the URL."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.content = content
        self.destination = destination
        self.image = image

    def __repr__(self):
        return f"{'Image' if self.image else 'Link'}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        return "a"


class Image(Link):
    """Image reference; *text* is the alternative text."""

    def __init__(self, text, destination):
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        return "img"


def parse_line(source):
    """Split inline Markdown into a list of strings and inline elements.

    Plain text between matches is kept as ``str`` items (possibly empty).
    If *source* ends in exactly one backslash, the backslash is removed and a
    ``HardBreak`` is appended as the last token.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    lookup = 0  # end of the previous match
    for match in _INLINE_PATTERN.finditer(source):
        tokens.append(source[lookup:match.start()])
        lookup = match.end()

        # The alternatives are mutually exclusive, so at most one branch fires.
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("textCode"):
            # An empty code span (``) is deliberately dropped.
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            # Compare against None: "![](pic.png)" has an empty but valid text.
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))

    tokens.append(source[lookup:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens


def _is_rule(line):
    """Return True if *line* is a horizontal rule (3+ chars drawn from ``-_* ``)."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _is_bullet(line):
    """Return True if *line* starts with a bullet marker followed by a space."""
    return line.startswith(_BULLET_PREFIXES)


def _is_ordered(line):
    """Return True if *line* starts with digits followed by a period."""
    return _ORDERED_MARKER.match(line) is not None


def _setext_level(underline):
    """Return 1 for an ``=`` underline, 2 for a ``-`` underline, else None."""
    stripped = underline.strip()
    if not stripped:
        return None
    if only_chars(stripped, "="):
        return 1
    if only_chars(stripped, "-"):
        return 2
    return None


def _interrupts_paragraph(line):
    """Return True if *line* ends a running paragraph."""
    return line.startswith(_PARAGRAPH_BREAKERS) or _is_ordered(line)


def tokenise(source):
    """Split Markdown *source* into a list of block elements.

    The list starts and ends with a void ``Element`` and contains one for
    each blank line; ``make_html`` skips them. One leading space is removed
    from every line first (this also applies to lines inside code fences).
    """
    tokens = []
    current_block = Element()

    # remove leading spaces (one per line)
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]
    total = len(lines)

    i = 0
    while i < total:
        line = lines[i]
        next_level = _setext_level(lines[i + 1]) if i + 1 < total else None

        # Every branch appends the previous block before starting a new one;
        # skipping that for same-typed neighbours would silently drop a block.
        if not line.strip():
            # Void block
            tokens.append(current_block)
            current_block = Element()
            i += 1
        elif _is_rule(line):
            # Horizontal rule
            tokens.append(current_block)
            current_block = Rule()
            i += 1
        elif _is_bullet(line):
            tokens.append(current_block)
            items = []
            while i < total and _is_bullet(lines[i]):
                inner = lines[i][2:].strip() + "\n"  # discard marker and space
                i += 1
                while i < total and lines[i].strip() and not _is_bullet(lines[i]):
                    continuation = lines[i]
                    # Drop one indent character, but never a text character.
                    if continuation[:1].isspace():
                        continuation = continuation[1:]
                    inner += continuation + "\n"
                    i += 1
                items.append(ListItem(inner))
            current_block = UnorderedList(items)
        elif _is_ordered(line):
            tokens.append(current_block)
            items = []
            # Test lines[i], not the first line, so the list ends where it should.
            while i < total and _is_ordered(lines[i]):
                inner = lines[i].split(".", 1)[1] + "\n"  # discard number and period
                i += 1
                while i < total and lines[i].strip() and not _is_ordered(lines[i]):
                    inner += lines[i] + "\n"
                    i += 1
                items.append(ListItem(inner))
            current_block = OrderedList(items)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            tokens.append(current_block)
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            tokens.append(current_block)
            quoted = ""
            # Unmarked lines continue the quote unless they are blank, start
            # with "#" or form a horizontal rule.
            while i < total and (
                lines[i].startswith(">")
                or (
                    lines[i].strip()
                    and not lines[i].startswith("#")
                    and not _is_rule(lines[i])
                )
            ):
                quoted += lines[i].lstrip(">") + "\n"
                i += 1
            current_block = Blockquote(quoted)
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            tokens.append(current_block)
            fence = line[:3]  # the closing fence must use the same character
            language = line.lstrip("`~").strip() or "text"
            code = ""
            i += 1  # skip the opening fence
            while i < total and lines[i].strip() != fence:
                code += lines[i] + "\n"
                i += 1
            if i < total:
                i += 1  # prevent a new block from beginning with the closing fence
            current_block = CodeBlock(code, language=language)
        elif next_level is not None:
            # Setext heading: text underlined with "=" or "-"
            tokens.append(current_block)
            current_block = Heading(line.strip(), next_level)
            i += 2
        else:
            # Create a paragraph, if there is no other specifier
            tokens.append(current_block)
            # Always consume the current line: lines such as "#tag" or "````"
            # match no branch above and must not stall the loop.
            text = line.strip() + "\n"
            i += 1
            while i < total and lines[i].strip() and not _interrupts_paragraph(lines[i]):
                text += lines[i].strip() + "\n"
                i += 1
            current_block = Paragraph(text)

    tokens.append(current_block)

    return tokens


def make_html(ast):
    """Render a token list from tokenise()/parse_line() as a BeautifulSoup object.

    Strings are appended as text, void elements are skipped, and every other
    element becomes a tag named after its ``tag_name``.

    Raises:
        ImportError: if the ``bs4`` package is not installed.
    """
    if beautifulsoup is None:
        raise ImportError("make_html() requires the 'bs4' (Beautiful Soup 4) package")

    # Name the parser so output does not depend on which ones are installed.
    soup = beautifulsoup.BeautifulSoup("", features="html.parser")
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        name = node.tag_name
        tag = soup.new_tag(str(name))
        if name == "a":
            tag["href"] = node.destination
        if name == "img":
            tag["src"] = node.destination
        if name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)
        try:
            if name == "img":
                # <img> is a void element: the text belongs in "alt".
                if node.content:
                    tag["alt"] = node.content
            elif isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content:
                tag.string = node.content
        except AttributeError as exc:
            # Best effort: report the problem and still emit the tag.
            print(exc, file=sys.stderr)
        soup.append(tag)
    return soup


if __name__ == '__main__':
    # Usage: markdown.py [FILE]; the historical path is the default input.
    default_path = "/home/vlad/roundabout/doc/changelog/0.1.0 (2024-03-31).md"
    path = sys.argv[1] if len(sys.argv) > 1 else default_path
    with open(path, encoding="utf-8") as file:
        # Generate an AST from a markdown file
        ast = tokenise(file.read())

    # Now convert the AST to HTML
    print(make_html(ast).prettify(formatter=beautifulsoup.formatter.HTMLFormatter(indent=4)))