markdown.py
Python script, ASCII text executable
"""Small Markdown-like tokeniser and HTML renderer.

``tokenise`` turns source text into a flat list of block elements
(headings, paragraphs, void separators); ``parse_line`` turns one line of
text into inline tokens (plain strings and inline elements); ``make_html``
renders either kind of list with BeautifulSoup.
"""
import re
import sys

try:
    import bs4 as beautifulsoup
except ImportError:
    # Only make_html() needs bs4; tokenising relies on the standard library.
    beautifulsoup = None

# Verbose-mode pattern: whitespace and trailing "# ..." comments are ignored.
inlineRegex = r"""
(?P<em>[*_]{1,7}) (?P<textEm>(?:\\[*]|[^*])*) (?P=em)   # emphasis
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`]   # inline code
|
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\)   # hyperlink or media
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2})   # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff)   # diffs
"""

# Compiled once at import time instead of on every parse_line() call.
_INLINE_PATTERN = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

# HTML only defines <h1> through <h6>.
_MAX_HEADING_LEVEL = 6


def leading(string, character):
    """Return how many characters ``str.lstrip(character)`` removes."""
    return len(string) - len(string.lstrip(character))


def trailing(string, character):
    """Return how many characters ``str.rstrip(character)`` removes."""
    return len(string) - len(string.rstrip(character))


class Element:
    """Base node; a bare ``Element`` is a void block that renders nothing."""

    def __init__(self):
        self.classes = []  # CSS class names attached to the rendered tag
        self.content = None  # str, list of tokens, or None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        return "m-void"


class Container(Element):
    """Element whose content is the inline-parsed form of a source string."""

    def __init__(self, content):
        super().__init__()
        self.content = parse_line(content)

    def __repr__(self):
        return "Generic container element: " + repr(self.content)


class Heading(Container):
    """Heading of a given level, rendered as ``h<level>``."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "h" + str(self.level)


class Paragraph(Container):
    """Paragraph built up one source line at a time via ``addLine``."""

    def __init__(self):
        super().__init__("")

    def addLine(self, content):
        """Append the inline tokens of ``content`` followed by a space."""
        self.content.extend([*parse_line(content), " "])

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "p"


class Emphasis(Container):
    """Emphasised text; ``value`` is the length of the delimiter run (1-7).

    The three low bits of ``value`` select the ``emphasis-3``,
    ``emphasis-2`` and ``emphasis-1`` classes respectively.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        if value >= 4:
            self.classes.append("emphasis-3")
        if value % 4 >= 2:
            self.classes.append("emphasis-2")
        if value % 2:
            self.classes.append("emphasis-1")

    def __repr__(self):
        return f"Emphasis ({self.value}): " + repr(self.content)

    @property
    def tag_name(self):
        return "em" if self.value == 1 else "strong"


class Code(Element):
    """Inline code; the text is stored verbatim, without inline parsing."""

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        return "code"


class Strikethrough(Container):
    """Text wrapped in ``~~``."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"

    @property
    def tag_name(self):
        return "s"


class Diff(Container):
    """Inserted (``++``) or deleted (``--``) text; ``value`` is the marker."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        return "ins" if self.value == "++" else "del"


class Link(Element):
    """Hyperlink; ``content`` is the raw link text, ``destination`` the URL."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.content = content
        self.destination = destination
        self.image = image

    def __repr__(self):
        # The link text lives in ``content``; there is no ``text`` attribute.
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        return "a"


class Image(Link):
    """Image reference; ``content`` holds the bracketed text."""

    def __init__(self, text, destination):
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        return "img"


def parse_line(source):
    """Split ``source`` into plain strings and inline elements.

    The text before every match and after the last match is always
    appended, even when empty, so strings and elements alternate.
    A single trailing backslash is replaced by a newline.

    Returns:
        list: ``str`` fragments interleaved with ``Element`` instances.
    """
    if trailing(source, "\\") == 1:
        source = source.rstrip("\\")
        source += "\n"

    tokens = []
    cursor = 0
    for match in _INLINE_PATTERN.finditer(source):
        tokens.append(source[cursor:match.start()])
        cursor = match.end()

        # Exactly one alternative matched, so emit exactly one token.
        # Groups that may legitimately match "" are compared against None.
        if match.group("em"):
            tokens.append(
                Emphasis(match.group("textEm"), len(match.group("em")))
            )
        elif match.group("textCode") is not None:
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            text = match.group("urlText")
            destination = match.group("urlDestination")
            if match.group("imageFlag"):
                tokens.append(Image(text, destination))
            else:
                tokens.append(Link(text, destination))

    tokens.append(source[cursor:])
    return tokens


def tokenise(source):
    """Split ``source`` into a list of block-level elements.

    A blank line closes the current block and opens a void ``Element``.
    A line starting with one to six ``#`` followed by a space is a
    ``Heading``; longer runs of ``#`` are treated as paragraph text because
    HTML has no heading beyond ``h6``. Any other line is added to the
    current ``Paragraph``, opening one if needed.

    Returns:
        list: ``Element`` instances; the first is always a void block.
    """
    tokens = []
    current_block = Element()  # an instance, so every entry is an Element

    for line in source.split("\n"):
        if not line.strip():
            tokens.append(current_block)
            current_block = Element()
            continue

        level = leading(line, "#")
        is_heading = (
            1 <= level <= _MAX_HEADING_LEVEL and line[level:].startswith(" ")
        )
        if is_heading:
            tokens.append(current_block)
            current_block = Heading(line[level:].strip(), level)
            continue

        if not isinstance(current_block, Paragraph):
            # Paragraph is the default block type.
            tokens.append(current_block)
            current_block = Paragraph()
        current_block.addLine(line.strip())

    tokens.append(current_block)
    return tokens


def _render_into(soup, parent, nodes):
    """Append the rendering of each item of ``nodes`` to ``parent``."""
    for node in nodes:
        if isinstance(node, str):
            parent.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag = soup.new_tag(str(node.tag_name))
        classes = getattr(node, "classes", None)
        if classes:
            tag["class"] = " ".join(classes)
        try:
            if isinstance(node, Image):
                # <img> is a void element: the text becomes the alt attribute.
                tag["src"] = node.destination
                tag["alt"] = node.content
            elif isinstance(node, Link):
                tag["href"] = node.destination
                tag.string = node.content
            elif isinstance(node.content, list):
                _render_into(soup, tag, node.content)
            elif node.content:
                tag.string = node.content
        except AttributeError as exc:
            # Best effort: report the malformed node and keep the bare tag.
            print(exc, file=sys.stderr)
        parent.append(tag)


def make_html(ast):
    """Render a list of strings/elements into a ``BeautifulSoup`` document.

    Void blocks are skipped. Element classes become the ``class``
    attribute, link destinations become ``href`` and image destinations
    become ``src``.

    Raises:
        ImportError: if the ``bs4`` package is not installed.
    """
    if beautifulsoup is None:
        raise ImportError("make_html requires the 'bs4' package")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = beautifulsoup.BeautifulSoup("", "html.parser")
    _render_into(soup, soup, ast)
    return soup


def main():
    """Tokenise a sample document and print its HTML rendering."""
    # Generate an AST from a markdown sample
    ast = tokenise(
        """
# Hello World!
## Title 1
### Part 1
#### Chapter _1_
##### Article 1
###### Section 1
Lorem **i`p`sum**
dolor `sit` amet

...
"""
    )

    # Now convert the AST to HTML
    formatter = beautifulsoup.formatter.HTMLFormatter(indent=4)
    print(make_html(ast).prettify(formatter=formatter))


if __name__ == '__main__':
    main()