# markdown.py
# Python script, ASCII text executable
"""A small Markdown dialect: block tokeniser, inline parser and HTML renderer."""

import re
import sys

try:
    import bs4 as beautifulsoup
except ImportError:
    # Only make_html() needs BeautifulSoup; tokenise() and parse_line() are
    # pure standard library and stay usable without it.
    beautifulsoup = None


def only_chars(string, chars):
    """Return True if every character of *string* occurs in *chars*.

    An empty *string* is vacuously accepted, so callers that need a non-empty
    match must check the length themselves.
    """
    return set(string).issubset(chars)


# Compiled with re.VERBOSE: whitespace outside character classes is ignored
# and "#" starts a comment that runs to the end of the line.
inlineRegex = r"""
(?P<em>[*_]{1,7}) (?P<textEm>(?:\\[*]|[^*])*) (?P=em)  # emphasis
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`]  # inline code
|
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\)  # hyperlink or media
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2})  # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff)  # diffs
"""

# Compiled once at import time instead of on every parse_line() call.
_INLINE_PATTERN = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

_RULE_CHARS = "-_* "
_LIST_MARKERS = ("* ", "+ ", "- ")
_PARAGRAPH_BREAKERS = ("#", ">", "~~~", "```") + _LIST_MARKERS


def leading(string, character):
    """Return how many characters from *character* start *string*."""
    return len(string) - len(string.lstrip(character))


def trailing(string, character):
    """Return how many characters from *character* end *string*."""
    return len(string) - len(string.rstrip(character))


class Element:
    """Base node; a bare Element is a "void" separator that renders nothing."""

    def __init__(self):
        self.classes = []  # CSS class names emitted by make_html()
        self.content = None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        return "m-void"


class Container(Element):
    """Node whose content is *content* run through parse_line()."""

    def __init__(self, content):
        super().__init__()
        self.content = parse_line(content)

    def __repr__(self):
        return "Generic container element: " + repr(self.content)


class Rule(Element):
    """Horizontal rule."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        return "hr"


class HardBreak(Element):
    """Forced line break, produced by a single trailing backslash."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        return "br"


class Heading(Container):
    """Heading of the given *level*; renders as ``h<level>``."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "h" + str(self.level)


class Paragraph(Container):
    """Paragraph of inline content."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "p"


class CodeBlock(Element):
    """Fenced code block; *content* is kept as one raw string."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.content = content
        self.language = language

    def __repr__(self):
        return f"Code block ({self.language}):\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "pre"


class UnorderedList(Element):
    """Bulleted list; *content* is stored as given (tokenise passes ListItems)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Unordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ul"


class OrderedList(Element):
    """Numbered list; *content* is stored as given."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Ordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ol"


class ListItem(Paragraph):
    """List entry whose body is tokenised again as block-level Markdown."""

    def __init__(self, content):
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return "List item:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "li"


class Blockquote(Paragraph):
    """Quotation whose body is tokenised again as block-level Markdown."""

    def __init__(self, content):
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return "Blockquote:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "blockquote"


class Emphasis(Container):
    """Emphasis; *value* is the delimiter length (the regex allows 1 to 7).

    The three low bits of *value* select the classes ``emphasis-3``,
    ``emphasis-2`` and ``emphasis-1``.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        if value >= 4:
            self.classes.append("emphasis-3")
        if value % 4 >= 2:
            self.classes.append("emphasis-2")
        if value % 2:
            self.classes.append("emphasis-1")

    def __repr__(self):
        return f"Emphasis ({self.value}): " + repr(self.content)

    @property
    def tag_name(self):
        return "em" if self.value == 1 else "strong"


class Code(Element):
    """Inline code span; the text is wrapped in a list and not parsed further."""

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        return "code"


class Strikethrough(Container):
    """Struck-through inline content."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"

    @property
    def tag_name(self):
        return "s"


class Diff(Container):
    """Inserted (``++``) or deleted (``--``) inline content."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        return "ins" if self.value == "++" else "del"


class Link(Element):
    """Hyperlink; *content* is the raw link text and is not parsed further."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.content = content
        self.destination = destination
        self.image = image

    def __repr__(self):
        return f"{'Image' if self.image else 'Link'}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        return "a"


class Image(Link):
    """Image; *text* is the alternative text."""

    def __init__(self, text, destination):
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        return "img"


def parse_line(source):
    """Split *source* into a list of plain strings and inline elements.

    The text between two matches is always appended, even when it is empty,
    so the result alternates text and elements.  A single trailing backslash
    is removed and turned into a final HardBreak.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    cursor = 0
    for match in _INLINE_PATTERN.finditer(source):
        tokens.append(source[cursor:match.start()])
        cursor = match.end()

        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("textCode"):
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            # Compare against None, not truthiness: "![](pic.png)" has an
            # empty text but must still produce an element, otherwise the
            # matched span is consumed and silently lost.
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))

    tokens.append(source[cursor:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens


def _is_rule(line):
    """Return True for a horizontal rule: three or more of ``-_*`` and spaces."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, _RULE_CHARS)


def _is_list_item(line):
    """Return True if *line* opens a bullet item (marker followed by a space)."""
    return line.startswith(_LIST_MARKERS)


def _is_setext_underline(line):
    """Return True for a non-empty line made only of ``=`` or only of ``-``."""
    stripped = line.strip()
    return bool(stripped) and (only_chars(stripped, "=") or only_chars(stripped, "-"))


def _continues_blockquote(line):
    """Return True if *line* belongs to the blockquote being collected."""
    if line.startswith(">"):
        return True
    # Lazy continuation: plain text directly under a quote stays inside it,
    # but a blank line, a "#" line or a horizontal rule ends the quote.
    return bool(line.strip()) and not line.startswith("#") and not _is_rule(line)


def _continues_paragraph(line):
    """Return True if *line* can be appended to the current paragraph."""
    return bool(line.strip()) and not line.startswith(_PARAGRAPH_BREAKERS)


def _strip_indent(line, width):
    """Remove at most *width* leading spaces from *line*."""
    return line[min(leading(line, " "), width):]


def tokenise(source):
    """Split Markdown *source* into a list of block-level elements.

    The list starts with a void Element, and every blank line contributes
    another one; these act as separators and are skipped by make_html().
    """
    tokens = []
    current_block = Element()

    raw_lines = source.split("\n")
    # One leading space is tolerated in front of any block marker.
    lines = [raw[1:] if raw.startswith(" ") else raw for raw in raw_lines]
    total = len(lines)

    i = 0
    while i < total:
        line = lines[i]
        stripped = line.strip()

        # Every branch below starts a new block, so the finished one is
        # always flushed first; skipping this would drop it, not merge it.
        tokens.append(current_block)

        if not stripped:
            # Void block
            current_block = Element()
            i += 1
        elif _is_rule(line):
            current_block = Rule()
            i += 1
        elif _is_list_item(line):
            items = []
            while i < total and _is_list_item(lines[i]):
                parts = [lines[i][2:].strip()]
                i += 1
                # Indented lines continue the current item.
                while i < total and lines[i].startswith(" "):
                    parts.append(_strip_indent(lines[i], 2))
                    i += 1
                items.append(ListItem("\n".join(parts) + "\n"))
            current_block = UnorderedList(items)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # ATX heading: the "#" run must be followed by a space.
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            quoted = []
            while i < total and _continues_blockquote(lines[i]):
                quoted.append(lines[i].lstrip(">"))
                i += 1
            current_block = Blockquote("".join(text + "\n" for text in quoted))
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            fence = line[:3]
            language = line.lstrip("`~").strip() or "text"
            # An unindented fence keeps its body verbatim so that code
            # indentation survives; under a one-space-indented fence the
            # same single space is removed from the body lines.
            body = lines if raw_lines[i].startswith(" ") else raw_lines

            i += 1  # skip the opening fence
            code = []
            while i < total and lines[i].strip() != fence:
                code.append(body[i])
                i += 1
            if i < total:
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock("".join(text + "\n" for text in code), language=language)
        elif i + 1 < total and _is_setext_underline(lines[i + 1]):
            # Setext heading: text underlined with "=" (h1) or "-" (h2).
            level = 1 if lines[i + 1].strip().startswith("=") else 2
            current_block = Heading(stripped, level)
            i += 2
        else:
            # Paragraph.  The first line is taken unconditionally: it reached
            # this branch because no other block claimed it, and refusing it
            # here would leave ``i`` unchanged and loop forever.
            text_lines = [stripped]
            i += 1
            while i < total and _continues_paragraph(lines[i]):
                text_lines.append(lines[i].strip())
                i += 1
            current_block = Paragraph("".join(text + "\n" for text in text_lines))

    tokens.append(current_block)

    return tokens


def make_html(ast):
    """Render a list of strings and elements into a BeautifulSoup tree.

    Void elements (tag name ``m-void``) are skipped.  List-valued content is
    rendered recursively; string content becomes the tag's text, except for
    images, where it becomes the ``alt`` attribute.

    Raises:
        ImportError: if the ``bs4`` package is not installed.
    """
    if beautifulsoup is None:
        raise ImportError("make_html() requires the 'bs4' (BeautifulSoup) package")

    # Name the parser explicitly so the result does not depend on whether
    # lxml happens to be installed.
    soup = beautifulsoup.BeautifulSoup("", "html.parser")
    for node in ast:
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        name = node.tag_name
        tag = soup.new_tag(str(name))
        if name == "a":
            tag["href"] = node.destination
        elif name == "img":
            tag["src"] = node.destination
        elif name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)

        try:
            if name == "img":
                # <img> is a void element: its text belongs in ``alt``.
                tag["alt"] = node.content
            elif isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content:
                tag.string = node.content
        except AttributeError as exc:
            print(exc, file=sys.stderr)
        soup.append(tag)
    return soup


def main():
    """Tokenise a small sample document and print it as indented HTML."""
    # Generate an AST from a markdown file
    document = tokenise(
        r"""
Hello World
===========

Lorem
ipsum
dolor
sit
amet.

1. Test
2. Test
3. Test

* Lorem
ipsum
* Test
* Test
"""
    )

    # Now convert the AST to HTML
    print(make_html(document).prettify(formatter=beautifulsoup.formatter.HTMLFormatter(indent=4)))


if __name__ == '__main__':
    main()