markdown.py
Python script, ASCII text executable
# Minimal Markdown-to-HTML converter.
#
# tokenise() splits a document into block elements (headings, paragraphs,
# blockquotes), parse_line() splits a block's text into inline elements
# (emphasis, code, links, ...), and make_html() renders the resulting tree
# with BeautifulSoup.

import re
import sys

try:
    import bs4 as beautifulsoup
except ImportError:
    # Only make_html() needs BeautifulSoup; keep the parser importable
    # (and testable) on systems where bs4 is not installed.
    beautifulsoup = None

inlineRegex = r"""
(?P<em>[*_]{1,7}) (?P<textEm>(?:\\[*]|[^*])*) (?P=em) # emphasis
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`] # inline code
|
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
"""

# Compiled once: parse_line() is called recursively for every container.
_INLINE_PATTERN = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

_MAX_HEADING_LEVEL = 6  # HTML defines h1..h6 only

_DEMO_MARKDOWN = """
# Hello World!
## Title 1
### Part 1
#### Chapter _1_
##### Article 1
###### Section 1
Lorem **i`p`sum**
dolor `sit` amet

> Make it as simple as possible, [but not simpler](https://wikipedia.org).
> > If you can't explain it simply, you don't understand it well enough.

...
"""


def leading(string, character):
    """Return how many characters `string.lstrip(character)` removes."""
    return len(string) - len(string.lstrip(character))


def trailing(string, character):
    """Return how many characters `string.rstrip(character)` removes."""
    return len(string) - len(string.rstrip(character))


class Element:
    """Base node of the tree; a bare Element is a void block (a separator)."""

    def __init__(self):
        self.classes = []  # CSS class names emitted by make_html()
        self.content = None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        return "m-void"


class Container(Element):
    """Element whose content is a list of inline tokens parsed from text."""

    def __init__(self, content):
        super().__init__()
        self.content = parse_line(content)

    def __repr__(self):
        return "Generic container element: " + repr(self.content)


class Heading(Container):
    """Heading block; `level` is the number of leading '#' characters."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        # Clamp so that seven or more '#' never yields an invalid tag like <h7>.
        return "h" + str(min(self.level, _MAX_HEADING_LEVEL))


class Paragraph(Container):
    """Paragraph block holding inline tokens."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "p"


class Blockquote(Paragraph):
    """Blockquote block; its content is a nested list of block tokens."""

    def __init__(self, content):
        super().__init__("")
        # A quote contains blocks, not inline text, so re-run the block parser.
        self.content = tokenise(content)

    def __repr__(self):
        return "Blockquote:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "blockquote"


class Emphasis(Container):
    """Emphasised text; `value` is the length of the delimiter run (1-7).

    The three low bits of `value` select the CSS classes emphasis-1,
    emphasis-2 and emphasis-3.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        if value >= 4:
            self.classes.append("emphasis-3")
        if value % 4 >= 2:
            self.classes.append("emphasis-2")
        if value % 2:
            self.classes.append("emphasis-1")

    def __repr__(self):
        return f"Emphasis ({self.value}): " + repr(self.content)

    @property
    def tag_name(self):
        return "em" if self.value == 1 else "strong"


class Code(Element):
    """Inline code span; the text is kept verbatim (never parsed further)."""

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        return "code"


class Strikethrough(Container):
    """Text wrapped in '~~'."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"

    @property
    def tag_name(self):
        return "s"


class Diff(Container):
    """Inserted ('++') or deleted ('--') text; `value` is the delimiter."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        return "ins" if self.value == "++" else "del"


class Link(Element):
    """Hyperlink; `content` is the plain link text, `destination` the URL."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.content = content
        self.destination = destination
        self.image = image

    def __repr__(self):
        return f"{'Image' if self.image else 'Link'}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        return "a"


class Image(Link):
    """Image reference written as '![text](destination)'."""

    def __init__(self, text, destination):
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        return "img"


def parse_line(source):
    """Split inline Markdown into a list of plain strings and elements.

    The text between matches is kept as `str` items (possibly empty), so
    the result always starts and ends with a string.
    """
    # A single trailing backslash requests a hard line break.
    if trailing(source, "\\") == 1:
        source = source.rstrip("\\")
        source += "\n"

    tokens = []
    cursor = 0
    for match in _INLINE_PATTERN.finditer(source):
        tokens.append(source[cursor:match.start()])
        cursor = match.end()

        # Exactly one alternative matched. Compare against None rather than
        # relying on truthiness, so that a match with empty text (for example
        # "[](url)") still produces an element instead of vanishing.
        if match.group("em") is not None:
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("textCode") is not None:
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike") is not None:
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff") is not None:
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))

    tokens.append(source[cursor:])
    return tokens


def _is_heading(line):
    """Return True when `line` is one or more '#' followed by a space."""
    return line.startswith("#") and leading(line.lstrip("#"), " ") > 0


def tokenise(source):
    """Split Markdown text into a list of block elements.

    Blank lines become void `Element` separators. The list always starts
    with the initial void element.
    """
    tokens = []
    current_block = Element()
    lines = source.split("\n")

    i = 0
    while i < len(lines):
        line = lines[i]

        # Every branch below starts a new block, so the previous one is
        # always flushed. (Skipping the flush when the previous block was a
        # Paragraph used to discard a Blockquote followed directly by text,
        # because Blockquote subclasses Paragraph.)
        tokens.append(current_block)

        if not line.strip():
            current_block = Element()
            i += 1
        elif _is_heading(line):
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            quoted = []
            while i < len(lines) and lines[i].startswith(">"):
                quoted.append(lines[i].lstrip(">").strip() + "\n")
                i += 1
            current_block = Blockquote("".join(quoted))
        else:
            # Paragraph is the default. Stop only at lines another branch
            # would accept; testing for a bare leading "#" here made lines
            # such as "#hashtag" unconsumable and looped forever.
            text = []
            while (
                i < len(lines)
                and lines[i].strip()
                and not _is_heading(lines[i])
                and not lines[i].startswith(">")
            ):
                text.append(lines[i].strip() + "\n")
                i += 1
            current_block = Paragraph("".join(text))

    tokens.append(current_block)
    return tokens


def make_html(ast):
    """Render a list of tokens (strings and elements) as a BeautifulSoup tree.

    Raises:
        ImportError: if the bs4 package is not installed.
    """
    if beautifulsoup is None:
        raise ImportError("make_html requires the 'bs4' (BeautifulSoup) package")

    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag = soup.new_tag(str(node.tag_name))
        if node.tag_name == "a":
            tag["href"] = node.destination
        if node.tag_name == "img":
            tag["src"] = node.destination
            # <img> is a void element: its text belongs in `alt`, not inside.
            if node.content:
                tag["alt"] = node.content
        if node.classes:
            tag["class"] = " ".join(node.classes)
        try:
            if isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content and node.tag_name != "img":
                tag.string = node.content
        except AttributeError as exc:
            # Best effort: report the problem and still emit the (empty) tag.
            print(exc, file=sys.stderr)
        soup.append(tag)
    return soup


def main():
    """Tokenise the demo document, dump the tree, then print its HTML."""
    ast = tokenise(_DEMO_MARKDOWN)
    for block in ast:
        print(repr(block))

    # Now convert the AST to HTML
    print(make_html(ast).prettify(formatter=beautifulsoup.formatter.HTMLFormatter(indent=4)))


if __name__ == '__main__':
    main()