markdown.py
Python script, ASCII text executable
import re
import sys

try:
    import bs4 as beautifulsoup
except ImportError:
    # bs4 is only needed by make_html(); keep the tokeniser importable
    # without it.  make_html() raises a clear ImportError in that case.
    beautifulsoup = None


def only_chars(string, chars):
    """Return True if every character of *string* occurs in *chars*.

    An empty *string* yields True.
    """
    return set(string).issubset(set(chars))


# Inline grammar.  Compiled with re.VERBOSE, so whitespace outside character
# classes is ignored and "# ..." runs to the end of the line as a comment.
inline_regex = r"""
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
|
<(?P<urlDestination2>[^<>]*)> # autolink
|
(?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em) # emphasis with * not requiring space on either side
|
(?<!\S)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^*])*) (?P=em2)(?=\s|$) # emphasis with _ requiring space on at least one side
|
[``] (?P<textCode2>(?:\\[``]|[^``])*) [``] # inline code (2 backticks)
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`] # inline code
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
"""

# Compiled once instead of on every parse_line() call.
_INLINE_PATTERN = re.compile(inline_regex, re.MULTILINE | re.DOTALL | re.VERBOSE)

# Characters a horizontal rule may consist of.
_RULE_CHARS = "-_* "
# Prefixes that open an unordered list item (marker followed by a space).
_BULLET_PREFIXES = ("* ", "+ ", "- ")
# An ordered list marker, optionally preceded by one space.
_ORDERED_MARKER = re.compile(r"^ ?\d+\.")
# Prefixes that end a running paragraph.
_PARAGRAPH_STOPPERS = ("#", ">", ";", "* ", "+ ", "- ", "~~~", "```")


def leading(string, character):
    """Return how many leading characters of *string* are stripped by
    ``str.lstrip(character)``."""
    return len(string) - len(string.lstrip(character))


def trailing(string, character):
    """Return how many trailing characters of *string* are stripped by
    ``str.rstrip(character)``."""
    return len(string) - len(string.rstrip(character))


class Element:
    """Base node of the token tree; on its own it is an empty "void" block."""

    def __init__(self):
        self.classes = []  # joined into the "class" attribute by make_html
        self.content = None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        """Tag name used by make_html; "m-void" nodes are not rendered."""
        return "m-void"


class Container(Element):
    """Element whose content is a line of inline markup (see parse_line)."""

    def __init__(self, content):
        super().__init__()
        self.content = parse_line(content)

    def __repr__(self):
        return "Generic container element: " + repr(self.content)


class Rule(Element):
    """Horizontal rule."""

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        return "hr"


class HardBreak(Element):
    """Explicit line break, produced for a line ending in one backslash."""

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        return "br"


class Heading(Container):
    """Heading whose tag is "h" followed by *level*."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "h" + str(self.level)


class Paragraph(Container):
    """Paragraph of inline markup."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "p"


class CodeBlock(Element):
    """Fenced code block; *content* is kept as a raw string."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.content = content
        self.language = language

    def __repr__(self):
        return f"Code block ({self.language}):\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "pre"


class UnorderedList(Element):
    """Bulleted list; *content* is stored as given (tokenise passes ListItems)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Unordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ul"


class OrderedList(Element):
    """Numbered list; *content* is stored as given (tokenise passes ListItems)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Ordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ol"


class ListItem(Element):
    """List item; its source text is tokenised as block-level markup."""

    def __init__(self, content):
        super().__init__()
        self.content = tokenise(content)

    def __repr__(self):
        return "List item:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "li"


class Blockquote(Paragraph):
    """Block quote; its source text is tokenised as block-level markup."""

    def __init__(self, content):
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return "Blockquote:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "blockquote"


class Emphasis(Container):
    """Emphasised text.

    *value* is the number of marker characters.  It is decoded like a
    three-bit mask into the classes "emphasis-3" (value >= 4),
    "emphasis-2" (value % 4 >= 2) and "emphasis-1" (odd value).
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        if value >= 4:
            self.classes.append("emphasis-3")
        if value % 4 >= 2:
            self.classes.append("emphasis-2")
        if value % 2:
            self.classes.append("emphasis-1")

    def __repr__(self):
        return f"Emphasis ({self.value}): " + repr(self.content)

    @property
    def tag_name(self):
        return "em" if self.value == 1 else "strong"


class Code(Element):
    """Inline code; the text is stored verbatim as a one-element list."""

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        return "code"


class Strikethrough(Container):
    """Struck-through inline text."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"

    @property
    def tag_name(self):
        return "s"


class Diff(Container):
    """Inserted ("++") or deleted ("--") inline text; *value* is the marker."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        return "ins" if self.value == "++" else "del"


class Link(Element):
    """Hyperlink; *content* is parsed as inline markup."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.content = parse_line(content)
        self.destination = destination
        self.image = image

    def __repr__(self):
        return f"{'Image' if self.image else 'Link'}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        return "a"


class Image(Link):
    """Image; *text* becomes the alt text, *destination* the source."""

    def __init__(self, text, destination):
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        return "img"


def _autolink(target):
    """Build the Link for an ``<...>`` autolink whose inner text is *target*."""
    if "://" in target:
        return Link(target, target)

    # Remove tel, mailto, sms prefixes from the visible text.  Without a
    # prefix there is nothing to remove, so the whole target is shown
    # (str.partition would otherwise leave the text empty).
    _, separator, remainder = target.partition(":")
    url_text = remainder if separator else target
    url_destination = target
    if url_destination.startswith("mailto:"):
        # NOTE(review): both replacements are identity operations as written.
        # Presumably the replacement was an HTML entity that was lost before
        # this file reached review; confirm against the upstream source.
        url_destination = url_destination.replace("@", "@")  # prevent email harvesting
        url_text = url_text.replace("@", "@")  # prevent protocol injection
    return Link(url_text, url_destination)


def _inline_token(match):
    """Return the inline element for one ``_INLINE_PATTERN`` match.

    Returns None when the match carries nothing worth wrapping (an empty
    code span or an empty autolink).
    """
    groups = match.groupdict()
    if groups["em"]:
        return Emphasis(groups["textEm"], len(groups["em"]))
    if groups["em2"]:
        return Emphasis(groups["textEm2"], len(groups["em2"]))
    if groups["textCode"]:
        return Code(groups["textCode"])
    if groups["textCode2"]:
        return Code(groups["textCode2"])
    if groups["strike"]:
        return Strikethrough(groups["textStrike"])
    if groups["diff"]:
        return Diff(groups["textDiff"], groups["diff"])
    # "is not None" rather than truthiness: a link or image with empty text
    # still has a destination and must not be dropped.
    if groups["urlText"] is not None:
        if groups["imageFlag"]:
            return Image(groups["urlText"], groups["urlDestination"])
        return Link(groups["urlText"], groups["urlDestination"])
    if groups["urlDestination2"]:
        return _autolink(groups["urlDestination2"])
    return None


def parse_line(source):
    """Split inline markup into a list of strings and inline elements.

    The result alternates plain-text pieces (possibly empty strings) with
    Element instances.  If *source* ends in exactly one backslash, that
    backslash is removed and a HardBreak is appended to the result.

    Matched text that yields no element is kept verbatim instead of being
    deleted from the output.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    lookup = 0  # end of the previous match
    for match in _INLINE_PATTERN.finditer(source):
        tokens.append(source[lookup:match.start()])
        lookup = match.end()

        token = _inline_token(match)
        tokens.append(match.group(0) if token is None else token)

    tokens.append(source[lookup:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens


def _is_rule(line):
    """Return True if *line* is a horizontal rule (3+ rule characters)."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, _RULE_CHARS)


def _is_bullet(line):
    """Return True if *line* opens an unordered list item."""
    return line.startswith(_BULLET_PREFIXES)


def _read_unordered_list(lines, i):
    """Consume the unordered list starting at ``lines[i]``.

    Returns ``(UnorderedList, index of the first unconsumed line)``.
    """
    items = []
    while i < len(lines) and _is_bullet(lines[i]):
        parts = [lines[i][2:].strip()]  # discard marker and space
        i += 1
        # Non-blank lines up to the next marker belong to the current item.
        while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
            parts.append(lines[i])
            i += 1
        items.append(ListItem("\n".join(parts) + "\n"))
    return UnorderedList(items), i


def _read_ordered_list(lines, i):
    """Consume the ordered list starting at ``lines[i]``.

    Returns ``(OrderedList, index of the first unconsumed line)``.
    """
    items = []
    while i < len(lines) and _ORDERED_MARKER.match(lines[i]):
        marker, _, remainder = lines[i].partition(".")  # discard number and period
        # Width of the item's own marker, measured before advancing so the
        # last line of the input cannot index past the end.
        marker_length = len(marker) + 1
        parts = [remainder]
        i += 1
        while i < len(lines) and lines[i].strip() and not _ORDERED_MARKER.match(lines[i]):
            parts.append(lines[i][marker_length:])
            i += 1
        items.append(ListItem("\n".join(parts) + "\n"))
    return OrderedList(items), i


def _read_blockquote(lines, i):
    """Consume the block quote starting at ``lines[i]``.

    Lines starting with ">" are quoted; other non-blank lines continue the
    quote unless they start with "#" or are a horizontal rule.
    Returns ``(Blockquote, index of the first unconsumed line)``.
    """
    parts = []
    while i < len(lines) and (
        lines[i].startswith(">")
        or (lines[i].strip() and not lines[i].startswith("#") and not _is_rule(lines[i]))
    ):
        parts.append(lines[i].lstrip(">") + "\n")
        i += 1
    return Blockquote("".join(parts)), i


def _read_code_block(lines, i):
    """Consume the fenced code block whose opening fence is ``lines[i]``.

    Returns ``(CodeBlock, index of the first unconsumed line)``.
    """
    language = lines[i].lstrip("`~").strip()
    i += 1  # skip the opening fence

    parts = []
    while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
        parts.append(lines[i] + "\n")
        i += 1

    if i < len(lines):
        i += 1  # prevent a new block from beginning with the closing fence

    return CodeBlock("".join(parts), language=language), i


def _continues_paragraph(line):
    """Return True if *line* may be appended to a running paragraph."""
    return bool(
        line.strip()
        and not line.startswith(_PARAGRAPH_STOPPERS)
        and not re.match(r"^\d+\.", line)
    )


def _read_paragraph(lines, i):
    """Consume the paragraph starting at ``lines[i]``.

    The first line is always consumed: it reached this fallback because no
    other block type claimed it, so skipping it would stall the caller.
    Returns ``(Paragraph, index of the first unconsumed line)``.
    """
    parts = [lines[i].strip() + "\n"]
    i += 1
    while i < len(lines) and _continues_paragraph(lines[i]):
        parts.append(lines[i].strip() + "\n")
        i += 1
    return Paragraph("".join(parts)), i


def tokenise(source):
    """Split block-level markup into a list of Element nodes.

    One leading space is removed from every line before parsing.  The
    returned list starts with the initial void Element and contains a void
    Element for every blank line or line starting with ";".
    """
    tokens = []
    current_block = Element()

    # remove leading spaces (one per line)
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]

        # Every branch below starts a new block, so the previous block is
        # always emitted first; it is never silently replaced.
        tokens.append(current_block)

        if not line.strip() or line.startswith(";"):
            # Void block
            current_block = Element()
            i += 1
        elif _is_rule(line):
            # Horizontal rule
            current_block = Rule()
            i += 1
        elif _is_bullet(line):
            current_block, i = _read_unordered_list(lines, i)
        elif re.match(r"^\d+\.", line):
            current_block, i = _read_ordered_list(lines, i)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # ATX heading: "#" markers followed by at least one space
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            current_block, i = _read_blockquote(lines, i)
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            current_block, i = _read_code_block(lines, i)
        elif (
            i < len(lines) - 1
            and lines[i + 1].strip()
            and (only_chars(lines[i + 1].strip(), "=") or only_chars(lines[i + 1].strip(), "-"))
        ):
            # Setext heading: text underlined with "=" (level 1) or "-" (level 2)
            underline = lines[i + 1].strip()
            current_block = Heading(line.strip(), 1 if underline.startswith("=") else 2)
            i += 2
        else:
            # Create a paragraph, if there is no other specifier
            current_block, i = _read_paragraph(lines, i)

    tokens.append(current_block)

    return tokens


def _plain_text(tokens):
    """Concatenate the text found in *tokens*, descending into elements."""
    parts = []
    for token in tokens:
        if isinstance(token, str):
            parts.append(token)
        elif isinstance(token.content, list):
            parts.append(_plain_text(token.content))
        elif token.content:
            parts.append(str(token.content))
    return "".join(parts)


def make_html(ast):
    """Render a list of tokens (strings and Elements) with BeautifulSoup.

    Returns the BeautifulSoup object holding the generated nodes.  Elements
    whose tag name is "m-void" are skipped.

    Raises:
        ImportError: if the bs4 package is not installed.
    """
    if beautifulsoup is None:
        raise ImportError("make_html requires the 'bs4' (beautifulsoup4) package")

    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag = soup.new_tag(str(node.tag_name))
        if node.tag_name == "a":
            tag["href"] = node.destination
        if node.tag_name == "img":
            tag["src"] = node.destination
            # Alt text may contain inline elements; flatten them to text
            # rather than joining a list that is not all strings.
            tag["alt"] = _plain_text(node.content)
        if node.tag_name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)

        try:
            if node.tag_name == "img":
                tag.string = ""  # images carry their text in "alt" only
            elif isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content:
                tag.string = node.content
        except AttributeError as exc:
            print(exc, file=sys.stderr)
        soup.append(tag)
    return soup


def markdown2html(markdown):
    """Convert Markdown source text to a BeautifulSoup document."""
    return make_html(tokenise(markdown))