markdown.py
Python script, ASCII text executable
"""A small Markdown dialect: block tokeniser, inline parser and HTML renderer.

``tokenise`` turns Markdown source into a list of block ``Element`` objects,
``parse_line`` turns one run of inline text into strings and inline elements,
and ``make_html`` renders either of those into a BeautifulSoup tree.
"""

import re
import sys

try:
    import bs4 as beautifulsoup
except ImportError:
    # The tokeniser and inline parser are pure Python; only make_html needs
    # bs4, and it raises a clear ImportError if the package is missing.
    beautifulsoup = None


def only_chars(string, chars):
    """Return True if every character of *string* occurs in *chars*.

    An empty *string* trivially satisfies this and yields True.
    """
    return set(string).issubset(set(chars))


# Verbose-mode pattern; whitespace and "#" comments inside it are ignored.
# Each alternative's text class excludes that alternative's own delimiter.
inlineRegex = r"""
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\)  # hyperlink or media
|
(?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em)  # emphasis with * not requiring space on either side
|
(?<!\S)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^_])*) (?P=em2)(?=\s|$)  # emphasis with _ requiring whitespace or a line edge around it
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`]  # inline code
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2})  # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff)  # diffs
"""

# Compiled once instead of on every parse_line call.
_INLINE_PATTERN = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

_ORDERED_START = re.compile(r"^\d+\.")   # first line of an ordered list
_ORDERED_ITEM = re.compile(r"^ ?\d+\.")  # any item line inside an ordered list
_BULLETS = ("* ", "+ ", "- ")
_FENCES = ("```", "~~~")
_PARAGRAPH_BREAKERS = ("#", ">", ";") + _BULLETS + _FENCES[::-1]


def leading(string, character):
    """Return how many characters from *character* start *string*."""
    return len(string) - len(string.lstrip(character))


def trailing(string, character):
    """Return how many characters from *character* end *string*."""
    return len(string) - len(string.rstrip(character))


class Element:
    """Base node. On its own it is a "void" block that renders nothing."""

    def __init__(self):
        self.classes = []    # CSS class names emitted on the rendered tag
        self.content = None  # None, a str, or a list of str/Element children

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        return "m-void"


class Container(Element):
    """Element whose content is inline-parsed text."""

    def __init__(self, content):
        super().__init__()
        self.content = parse_line(content)

    def __repr__(self):
        return "Generic container element: " + repr(self.content)


class Rule(Element):
    """Horizontal rule."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        return "hr"


class HardBreak(Element):
    """Forced line break, produced by a single trailing backslash."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        return "br"


class Heading(Container):
    """Heading of the given *level*; rendered as ``h<level>``."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "h" + str(self.level)


class Paragraph(Container):
    """Paragraph of inline-parsed text."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "p"


class CodeBlock(Element):
    """Fenced code block; *content* is kept as a raw string."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.content = content
        self.language = language

    def __repr__(self):
        return f"Code block ({self.language}):\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "pre"


class UnorderedList(Element):
    """Bulleted list; *content* is a list of ListItem objects."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Unordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ul"


class OrderedList(Element):
    """Numbered list; *content* is a list of ListItem objects."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Ordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ol"


class ListItem(Element):
    """List item whose source is block-tokenised recursively."""

    def __init__(self, content):
        super().__init__()
        self.content = tokenise(content)

    def __repr__(self):
        return "List item:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "li"


class Blockquote(Paragraph):
    """Block quote whose source is block-tokenised recursively."""

    def __init__(self, content):
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return "Blockquote:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "blockquote"


class Emphasis(Container):
    """Emphasis whose strength *value* is the delimiter length (1-7).

    The three low bits of *value* map onto the CSS classes ``emphasis-1``,
    ``emphasis-2`` and ``emphasis-3``.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        if value >= 4:
            self.classes.append("emphasis-3")
        if value % 4 >= 2:
            self.classes.append("emphasis-2")
        if value % 2:
            self.classes.append("emphasis-1")

    def __repr__(self):
        return f"Emphasis ({self.value}): " + repr(self.content)

    @property
    def tag_name(self):
        return "em" if self.value == 1 else "strong"


class Code(Element):
    """Inline code; the raw text is stored as a one-element list."""

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        return "code"


class Strikethrough(Container):
    """Struck-through inline text."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"

    @property
    def tag_name(self):
        return "s"


class Diff(Container):
    """Inserted (``++``) or deleted (``--``) inline text."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        return "ins" if self.value == "++" else "del"


class Link(Element):
    """Hyperlink with inline-parsed text and a destination URL."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.content = parse_line(content)
        self.destination = destination
        self.image = image

    def __repr__(self):
        return f"{'Image' if self.image else 'Link'}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        return "a"


class Image(Link):
    """Image; the link text becomes the alt text."""

    def __init__(self, text, destination):
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        return "img"


def parse_line(source):
    """Split inline Markdown *source* into strings and inline elements.

    Returns a list that alternates literal text with ``Element`` objects.
    If *source* ends in exactly one backslash, that backslash is removed and
    a ``HardBreak`` is appended to the result.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    cursor = 0
    for match in _INLINE_PATTERN.finditer(source):
        tokens.append(source[cursor:match.start()])
        cursor = match.end()

        # Exactly one alternative matched. Compare against None rather than
        # truthiness so that matches with empty text (e.g. "![](src)") are
        # kept instead of being silently dropped.
        if match.group("em") is not None:
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("em2") is not None:
            tokens.append(Emphasis(match.group("textEm2"), len(match.group("em2"))))
        elif match.group("textCode") is not None:
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike") is not None:
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff") is not None:
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))

    tokens.append(source[cursor:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens


def _is_rule(line):
    """Return True if *line* is a horizontal rule (3+ of ``-``, ``_``, ``*``, space)."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _is_bullet(line):
    """Return True if *line* starts with a bullet marker followed by a space."""
    return line.startswith(_BULLETS)


def _is_setext_underline(line):
    """Return True if *line* is a non-empty run of only ``=`` or only ``-``."""
    stripped = line.strip()
    return bool(stripped) and (only_chars(stripped, "=") or only_chars(stripped, "-"))


def _ends_paragraph(line):
    """Return True if *line* cannot continue the paragraph before it."""
    return (
        not line.strip()
        or line.startswith(_PARAGRAPH_BREAKERS)
        or _ORDERED_START.match(line) is not None
    )


def _join_lines(parts):
    """Join *parts* into one string with a newline after every part."""
    return "".join(part + "\n" for part in parts)


def tokenise(source):
    """Split Markdown *source* into a list of block-level elements.

    One leading space is removed from every line first. The returned list
    starts with a void ``Element`` and contains a void ``Element`` for each
    blank or ``;``-prefixed line; ``make_html`` skips those when rendering.
    """
    tokens = []
    current_block = Element()

    # Remove one leading space per line.
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]

        # Every branch starts a new block, so the previous one is always
        # flushed first. Skipping the flush when both blocks had the same
        # type would discard the earlier block.
        tokens.append(current_block)

        if not line.strip() or line.startswith(";"):
            # Void block (blank line or ";" line)
            current_block = Element()
            i += 1
        elif _is_rule(line):
            current_block = Rule()
            i += 1
        elif _is_bullet(line):
            items = []
            while i < len(lines) and _is_bullet(lines[i]):
                item_lines = [lines[i][2:].strip()]  # discard marker and space
                i += 1
                # Non-blank lines up to the next marker belong to this item.
                while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
                    item_lines.append(lines[i][1:])
                    i += 1
                items.append(ListItem(_join_lines(item_lines)))
            current_block = UnorderedList(items)
        elif _ORDERED_START.match(line):
            items = []
            while i < len(lines) and _ORDERED_ITEM.match(lines[i]):
                marker, rest = lines[i].split(".", 1)  # discard number and period
                # Continuation lines are un-indented by the width of this
                # item's own marker; measure it before moving past the line.
                marker_length = len(marker) + 1
                item_lines = [rest]
                i += 1
                while i < len(lines) and lines[i].strip() and not _ORDERED_ITEM.match(lines[i]):
                    item_lines.append(lines[i][marker_length:])
                    i += 1
                items.append(ListItem(_join_lines(item_lines)))
            current_block = OrderedList(items)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # ATX heading: hashes followed by at least one space
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            quoted = []
            while i < len(lines) and (
                lines[i].startswith(">")
                # Lazy continuation: any non-blank line that is neither a
                # "#" line nor a horizontal rule stays inside the quote.
                or (lines[i].strip() and not lines[i].startswith("#") and not _is_rule(lines[i]))
            ):
                quoted.append(lines[i].lstrip(">"))
                i += 1
            current_block = Blockquote(_join_lines(quoted))
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            language = line.lstrip("`~").strip()
            code_lines = []
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in _FENCES:
                code_lines.append(lines[i])
                i += 1
            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence
            current_block = CodeBlock(_join_lines(code_lines), language=language)
        elif i < len(lines) - 1 and _is_setext_underline(lines[i + 1]):
            # Setext heading: text underlined with "=" (h1) or "-" (h2)
            current_block = Heading(line.strip(), 1 if lines[i + 1].startswith("=") else 2)
            i += 2
        else:
            # Paragraph, if there is no other specifier. The current line
            # matched no other block type, so it is consumed unconditionally;
            # otherwise a line such as "#hashtag" would never advance i.
            para_lines = [line.strip()]
            i += 1
            while i < len(lines) and not _ends_paragraph(lines[i]):
                para_lines.append(lines[i].strip())
                i += 1
            current_block = Paragraph(_join_lines(para_lines))

    tokens.append(current_block)

    return tokens


def _plain_text(nodes):
    """Concatenate the text of *nodes*, descending into element content."""
    parts = []
    for node in nodes:
        if isinstance(node, str):
            parts.append(node)
            continue
        inner = getattr(node, "content", None)
        if isinstance(inner, str):
            parts.append(inner)
        elif isinstance(inner, list):
            parts.append(_plain_text(inner))
    return "".join(parts)


def make_html(ast):
    """Render a list of strings and elements into a BeautifulSoup tree.

    Strings are appended as text. Void elements (``tag_name == "m-void"``)
    and objects without a ``content`` attribute are skipped.

    Raises:
        ImportError: if the ``bs4`` package is not installed.
    """
    if beautifulsoup is None:
        raise ImportError("make_html requires the 'bs4' (BeautifulSoup) package")

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = beautifulsoup.BeautifulSoup("", "html.parser")
    for node in ast:
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        name = node.tag_name
        tag = soup.new_tag(str(name))
        if name == "a":
            tag["href"] = node.destination
        if name == "img":
            tag["src"] = node.destination
            # Alt text may contain inline elements; flatten it to plain text.
            tag["alt"] = _plain_text(node.content)
        if name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)
        try:
            if name == "img":
                # <img> is a void element: its text lives in "alt" only.
                pass
            elif isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content:
                tag.string = node.content
        except AttributeError as exc:
            print(exc, file=sys.stderr)
        soup.append(tag)
    return soup


def markdown2html(markdown):
    """Convert Markdown source text to a BeautifulSoup HTML tree."""
    return make_html(tokenise(markdown))