# markdown.py
# Python script, ASCII text executable
"""Small Markdown dialect parser.

``tokenise`` splits Markdown source into block elements, ``parse_line`` handles
inline markup inside a block, and ``make_html`` renders the resulting tree with
BeautifulSoup.
"""
import re
import sys

try:
    import bs4 as beautifulsoup
except ImportError:
    # Only make_html() and the demo below need bs4; tokenising works without it.
    beautifulsoup = None


def only_chars(string, chars):
    """Return True when every character of *string* occurs in *chars*."""
    return set(string).issubset(chars)


inlineRegex = r"""
(?P<em>[*_]{1,7}) (?P<textEm>(?:\\[*]|[^*])*) (?P=em)    # emphasis
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`]    # inline code
|
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\)    # hyperlink or media
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2})    # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff)    # diffs
"""

# Compiled once at import time instead of on every parse_line() call.
_INLINE_PATTERN = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

# A single (unescaped) backslash immediately before a newline is a hard break.
_HARD_BREAK_PATTERN = re.compile(r"(?<!\\)\\\n")


def leading(string, character):
    """Return how many characters *string* loses to ``lstrip(character)``."""
    return len(string) - len(string.lstrip(character))


def trailing(string, character):
    """Return how many characters *string* loses to ``rstrip(character)``."""
    return len(string) - len(string.rstrip(character))


class Element:
    """Base node of the tree; a bare Element is a "void" block with no output."""

    def __init__(self):
        self.classes = []  # CSS class names emitted by make_html()
        self.content = None  # None, a string, or a list of child tokens

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        return "m-void"


class Container(Element):
    """Element whose content is the inline-parsed form of its source text."""

    def __init__(self, content):
        super().__init__()
        self.content = parse_line(content)

    def __repr__(self):
        return "Generic container element: " + repr(self.content)


class Rule(Element):
    """Horizontal rule (``<hr>``)."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        return "hr"


class HardBreak(Element):
    """Forced line break (``<br>``)."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        return "br"


class Heading(Container):
    """Heading whose tag is ``h`` followed by *level*."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "h" + str(self.level)


class Paragraph(Container):
    """Paragraph of inline-parsed text."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "p"


class Blockquote(Paragraph):
    """Block quote; its content is block-tokenised again, so quotes can nest."""

    def __init__(self, content):
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return "Blockquote:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "blockquote"


class Emphasis(Container):
    """Emphasised text; *value* is the number of delimiter characters (1-7).

    Each set bit of *value* contributes one ``emphasis-N`` CSS class.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        if value >= 4:
            self.classes.append("emphasis-3")
        if value % 4 >= 2:
            self.classes.append("emphasis-2")
        if value % 2:
            self.classes.append("emphasis-1")

    def __repr__(self):
        return f"Emphasis ({self.value}): " + repr(self.content)

    @property
    def tag_name(self):
        return "em" if self.value == 1 else "strong"


class Code(Element):
    """Inline code span; the text is kept verbatim as a one-item list."""

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        return "code"


class Strikethrough(Container):
    """Struck-through text (``<s>``)."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"

    @property
    def tag_name(self):
        return "s"


class Diff(Container):
    """Inserted (``++``) or deleted (``--``) text; *value* is the delimiter."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        return "ins" if self.value == "++" else "del"


class Link(Element):
    """Hyperlink; *content* is the raw link text, *destination* the target."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.content = content
        self.destination = destination
        self.image = image

    def __repr__(self):
        return f"{'Image' if self.image else 'Link'}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        return "a"


class Image(Link):
    """Image; *text* is the alternative text, *destination* the source URL."""

    def __init__(self, text, destination):
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        return "img"


def _text_tokens(text):
    """Split plain text at backslash-newline pairs, interleaving HardBreaks."""
    first, *rest = _HARD_BREAK_PATTERN.split(text)
    tokens = [first]
    for piece in rest:
        tokens.append(HardBreak())
        tokens.append(piece)
    return tokens


def parse_line(source):
    """Parse inline markup in *source* into a list of tokens.

    The result alternates plain strings (possibly empty) with Element
    instances.  A single backslash at the very end of *source*, or directly
    before a newline inside plain text, becomes a HardBreak.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    cursor = 0
    for match in _INLINE_PATTERN.finditer(source):
        tokens.extend(_text_tokens(source[cursor:match.start()]))
        cursor = match.end()

        # Compare against None: a matched group may legitimately be empty
        # (e.g. an image without alt text) and must not be dropped.
        if match.group("em") is not None:
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("textCode") is not None:
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike") is not None:
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff") is not None:
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))

    tokens.extend(_text_tokens(source[cursor:]))

    if hard_break:
        tokens.append(HardBreak())

    return tokens


def tokenise(source):
    """Split Markdown *source* into a list of block-level elements.

    The list always starts with a void Element, and every blank line yields
    another void Element; make_html() skips those.
    """
    tokens = [Element()]
    lines = source.split("\n")

    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        if not stripped:
            # Void block
            tokens.append(Element())
            i += 1
        elif only_chars(stripped, "-_* ") and len(stripped) >= 3:
            # Horizontal rule
            tokens.append(Rule())
            i += 1
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # Heading: one or more "#" followed by at least one space
            tokens.append(Heading(line.lstrip("#").strip(), leading(line, "#")))
            i += 1
        elif line.startswith(">"):
            # Block quote: gather the consecutive ">" lines.  Only ONE marker
            # is removed per line so that ">>" still nests.
            quoted = []
            while i < len(lines) and lines[i].startswith(">"):
                quoted.append(lines[i][1:].strip() + "\n")
                i += 1
            tokens.append(Blockquote("".join(quoted)))
        else:
            # Paragraph is default.  The first line is always consumed, which
            # guarantees progress even for a "#word" line that is no heading.
            text = [stripped + "\n"]
            i += 1
            while (
                i < len(lines)
                and lines[i].strip()
                and not lines[i].startswith(("#", ">"))
            ):
                text.append(lines[i].strip() + "\n")
                i += 1
            tokens.append(Paragraph("".join(text)))

    return tokens


def make_html(ast):
    """Render a token list from tokenise()/parse_line() as a BeautifulSoup tree.

    Raises:
        ImportError: if the bs4 package is not installed.
    """
    if beautifulsoup is None:
        raise ImportError("make_html() requires the 'bs4' (BeautifulSoup) package")

    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag_name = node.tag_name
        tag = soup.new_tag(str(tag_name))
        if tag_name == "a":
            tag["href"] = node.destination
        elif tag_name == "img":
            tag["src"] = node.destination
            # <img> is a void element: its text belongs in "alt", not inside.
            tag["alt"] = node.content
        if node.classes:
            tag["class"] = " ".join(node.classes)

        try:
            if isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content and tag_name != "img":
                tag.string = node.content
        except AttributeError as exc:
            # Best effort: report the problem and still emit the (empty) tag.
            print(exc, file=sys.stderr)
        soup.append(tag)
    return soup


if __name__ == '__main__':
    # Generate an AST from a markdown file
    ast = tokenise(
        r"""
# Hello World!
## Title 1
### Part 1
#### Chapter _1_
##### Article 1
###### Section 1
Lorem **i`p`sum**
dolor `sit` amet

consectetur \
*adipiscing* elit

* * *

> Make it as simple as possible, [but not simpler](https://wikipedia.org).
> > If you can't explain it simply, you don't understand it well enough.

...
"""
    )
    for i in ast:
        print(repr(i))

    # Now convert the AST to HTML
    print(make_html(ast).prettify(formatter=beautifulsoup.formatter.HTMLFormatter(indent=4)))