markdown.py
Python script, ASCII text executable
"""A small Markdown dialect: block tokeniser, inline parser and HTML renderer."""

import re
import sys

try:
    import bs4 as beautifulsoup
except ImportError:
    # Tokenising and inline parsing do not need bs4; only make_html() does.
    beautifulsoup = None


def only_chars(string, chars):
    """Return True if every character of *string* is contained in *chars*."""
    return set(string).issubset(set(chars))


# Inline grammar, compiled with re.VERBOSE (whitespace and "#" comments in the
# pattern are ignored).  Exactly one alternative matches per match object.
inline_regex = r"""
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\)  # hyperlink or media
|
<(?P<urlDestination2>[^<>]*)>  # autolink
|
(?P<em>\*{1,7}) (?P<textEm>(?:\\\*|[^*])*) (?P=em)  # emphasis with * not requiring space on either side
|
(?<!\S)(?P<em2>_{1,7}) (?P<textEm2>(?:\\.|[^_])*) (?P=em2)(?=\s|$)  # emphasis with _ delimited by whitespace or line edges
|
`` (?P<textCode2>(?:[^`]|`(?!`))+) ``  # inline code (2 backticks), may contain single backticks
|
` (?P<textCode>(?:\\`|[^`])*) `  # inline code
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2})  # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff)  # diffs
"""

_INLINE_PATTERN = re.compile(inline_regex, re.MULTILINE | re.DOTALL | re.VERBOSE)

# Block-level helpers used by tokenise().
_BULLET_MARKERS = ("* ", "+ ", "- ")
_FENCES = ("```", "~~~")
_ORDERED_START = re.compile(r"\d+\.")
_ORDERED_ITEM = re.compile(r" ?\d+\.")
_PARAGRAPH_STOPPERS = ("#", ">", ";") + _BULLET_MARKERS + _FENCES[::-1]


def leading(string, character):
    """Return how many characters *string* loses to ``lstrip(character)``."""
    return len(string) - len(string.lstrip(character))


def trailing(string, character):
    """Return how many characters *string* loses to ``rstrip(character)``."""
    return len(string) - len(string.rstrip(character))


class Element:
    """Base node of the syntax tree; on its own it is a "void" placeholder."""

    def __init__(self):
        self.classes = []  # CSS class names emitted by make_html()
        self.content = None  # None, a string, or a list of child tokens

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        return "m-void"


class Container(Element):
    """Element whose content is the inline-parsed form of a source string."""

    def __init__(self, content):
        super().__init__()
        self.content = parse_line(content)

    def __repr__(self):
        return "Generic container element: " + repr(self.content)


class Rule(Element):
    """Horizontal rule."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        return "hr"


class HardBreak(Element):
    """Explicit line break."""

    def __init__(self):
        super().__init__()

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        return "br"


class Heading(Container):
    """Heading with inline content; *level* is appended to ``h`` for the tag."""

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "h" + str(self.level)


class Paragraph(Container):
    """Paragraph of inline content."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "p"


class CodeBlock(Element):
    """Fenced code block; *content* is kept as a raw string."""

    def __init__(self, content, language="text"):
        super().__init__()
        self.content = content
        self.language = language

    def __repr__(self):
        return f"Code block ({self.language}):\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "pre"


class UnorderedList(Element):
    """Bullet list; *content* is stored as given (tokenise passes ListItems)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Unordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ul"


class OrderedList(Element):
    """Numbered list; *content* is stored as given (tokenise passes ListItems)."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    def __repr__(self):
        return "Ordered list:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "ol"


class ListItem(Element):
    """List item whose source text is tokenised into nested blocks."""

    def __init__(self, content):
        super().__init__()
        self.content = tokenise(content)

    def __repr__(self):
        return "List item:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "li"


class Blockquote(Paragraph):
    """Block quotation whose source text is tokenised into nested blocks."""

    def __init__(self, content):
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return "Blockquote:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        return "blockquote"


class Emphasis(Container):
    """Inline emphasis; *value* is the number of marker characters used.

    The three low bits of *value* select the ``emphasis-3``, ``emphasis-2``
    and ``emphasis-1`` CSS classes (4, 2 and 1 respectively).
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        if value >= 4:
            self.classes.append("emphasis-3")
        if value % 4 >= 2:
            self.classes.append("emphasis-2")
        if value % 2:
            self.classes.append("emphasis-1")

    def __repr__(self):
        return f"Emphasis ({self.value}): " + repr(self.content)

    @property
    def tag_name(self):
        return "em" if self.value == 1 else "strong"


class Code(Element):
    """Inline code; the raw text is stored as a one-element list."""

    def __init__(self, content):
        super().__init__()
        self.content = [content]

    def __repr__(self):
        return f"Inline code: {self.content}"

    @property
    def tag_name(self):
        return "code"


class Strikethrough(Container):
    """Inline struck-through text."""

    def __init__(self, content):
        super().__init__(content)

    def __repr__(self):
        return f"Strikethrough: {repr(self.content)}"

    @property
    def tag_name(self):
        return "s"


class Diff(Container):
    """Inline insertion (``++``) or deletion (``--``); *value* is the marker."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return f"Diff ({self.value}): {self.content}"

    @property
    def tag_name(self):
        return "ins" if self.value == "++" else "del"


class Link(Element):
    """Hyperlink with inline-parsed text and a destination string."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.content = parse_line(content)
        self.destination = destination
        self.image = image

    def __repr__(self):
        return f"{'Image' if self.image else 'Link'}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        return "a"


class Image(Link):
    """Image reference; the link text is used as the alternative text."""

    def __init__(self, text, destination):
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        return "img"


def _autolink(target):
    """Build the Link for an ``<target>`` autolink."""
    if "://" in target:
        return Link(target, target)
    # Show the part after a "tel:", "mailto:", "sms:"-style prefix; when there
    # is no prefix at all, show the whole target instead of an empty text.
    _, separator, remainder = target.partition(":")
    # NOTE(review): the original replaced "@" with "@" for mailto links, which
    # is a no-op; no address obfuscation is performed here.
    return Link(remainder if separator else target, target)


def _inline_token(match):
    """Return the inline Element for *match*, or None if it has no content."""
    groups = match.groupdict()
    if groups["em"]:
        return Emphasis(groups["textEm"], len(groups["em"]))
    if groups["em2"]:
        return Emphasis(groups["textEm2"], len(groups["em2"]))
    code = groups["textCode2"] or groups["textCode"]
    if code:
        return Code(code)
    if groups["strike"]:
        return Strikethrough(groups["textStrike"])
    if groups["diff"]:
        return Diff(groups["textDiff"], groups["diff"])
    if groups["urlText"] is not None:
        # Empty link text is legitimate, e.g. an image without alt text.
        kind = Image if groups["imageFlag"] else Link
        return kind(groups["urlText"], groups["urlDestination"])
    if groups["urlDestination2"]:
        return _autolink(groups["urlDestination2"])
    return None


def parse_line(source):
    """Split inline *source* into a list of strings and inline Elements.

    The list always starts and ends with a (possibly empty) text string
    around the recognised elements.  A source ending in exactly one
    backslash has it removed and a HardBreak appended instead.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    tokens = []
    cursor = 0
    for match in _INLINE_PATTERN.finditer(source):
        tokens.append(source[cursor:match.start()])
        cursor = match.end()
        token = _inline_token(match)
        # A match that yields no element (e.g. an empty code span) is kept as
        # literal text so that no input is silently discarded.
        tokens.append(match.group(0) if token is None else token)
    tokens.append(source[cursor:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens


def _is_rule(line):
    """True for a horizontal rule: 3+ characters drawn from ``-_*`` and space."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def _is_bullet(line):
    """True if *line* starts with ``*``, ``+`` or ``-`` followed by a space."""
    return line.startswith(_BULLET_MARKERS)


def _is_setext_underline(line):
    """True for a non-empty line made only of ``=`` or only of ``-``."""
    stripped = line.strip()
    return bool(stripped) and (only_chars(stripped, "=") or only_chars(stripped, "-"))


def _strip_indent(line, width):
    """Remove at most *width* leading spaces from *line*."""
    return line[min(width, leading(line, " ")):]


def _read_unordered_list(lines, i):
    """Read consecutive bullet items starting at *i*; return (block, next_i)."""
    items = []
    while i < len(lines) and _is_bullet(lines[i]):
        parts = [lines[i][2:].strip()]  # discard marker and space
        i += 1
        # Non-blank lines that are not bullets continue the current item.
        while i < len(lines) and lines[i].strip() and not _is_bullet(lines[i]):
            parts.append(lines[i])
            i += 1
        items.append(ListItem("\n".join(parts) + "\n"))
    return UnorderedList(items), i


def _read_ordered_list(lines, i):
    """Read consecutive numbered items starting at *i*; return (block, next_i)."""
    items = []
    while i < len(lines) and _ORDERED_ITEM.match(lines[i]):
        marker, _, first = lines[i].partition(".")  # discard number and period
        indent = len(marker) + 1  # width of "<number>." on the item's own line
        parts = [first]
        i += 1
        while (i < len(lines) and lines[i].strip()
               and not _ORDERED_ITEM.match(lines[i])):
            # Only strip indentation that is really there, so unindented
            # continuation text is not truncated.
            parts.append(_strip_indent(lines[i], indent))
            i += 1
        items.append(ListItem("\n".join(parts) + "\n"))
    return OrderedList(items), i


def _read_blockquote(lines, i):
    """Read a block quotation starting at *i*; return (block, next_i)."""
    parts = []
    while i < len(lines):
        current = lines[i]
        if current.startswith(">"):
            # Drop a single marker so nested quotes survive for the recursive
            # tokenise() performed by Blockquote.
            parts.append(current[1:])
        elif (current.strip() and not current.startswith("#")
              and not _is_rule(current)):
            parts.append(current)  # unmarked continuation line
        else:
            break
        i += 1
    return Blockquote("\n".join(parts) + "\n"), i


def _read_code_block(lines, i):
    """Read a fenced code block whose opening fence is at *i*."""
    language = lines[i].lstrip("`~").strip()
    i += 1  # skip the opening fence
    body = []
    while i < len(lines) and lines[i].strip() not in _FENCES:
        body.append(lines[i] + "\n")
        i += 1
    if i < len(lines):
        i += 1  # prevent a new block from beginning with the closing fence
    return CodeBlock("".join(body), language=language), i


def _read_paragraph(lines, i):
    """Read a paragraph starting at *i*; return (block, next_i).

    The first line is always consumed: it reached this point because no other
    block type claimed it, and consuming it guarantees tokenise() progresses.
    """
    parts = [lines[i].strip()]
    i += 1
    while (i < len(lines) and lines[i].strip()
           and not lines[i].startswith(_PARAGRAPH_STOPPERS)
           and not _ORDERED_START.match(lines[i])):
        parts.append(lines[i].strip())
        i += 1
    return Paragraph("\n".join(parts) + "\n"), i


def tokenise(source):
    """Split Markdown *source* into a list of block-level Elements.

    One leading space is removed from every line before classification.  The
    returned list starts with a void Element and contains one further void
    Element per blank line or line starting with ``;``.
    """
    lines = [line[1:] if line.startswith(" ") else line
             for line in source.split("\n")]  # remove leading spaces
    total = len(lines)

    tokens = []
    current_block = Element()

    i = 0
    while i < total:
        line = lines[i]
        # Every branch below starts a fresh block, so the previous one is
        # always flushed first; skipping this would drop it from the output.
        tokens.append(current_block)

        if not line.strip() or line.startswith(";"):
            current_block = Element()  # void block
            i += 1
        elif _is_rule(line):
            current_block = Rule()
            i += 1
        elif _is_bullet(line):
            current_block, i = _read_unordered_list(lines, i)
        elif _ORDERED_START.match(line):
            current_block, i = _read_ordered_list(lines, i)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            current_block, i = _read_blockquote(lines, i)
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            current_block, i = _read_code_block(lines, i)
        elif i < total - 1 and _is_setext_underline(lines[i + 1]):
            level = 1 if lines[i + 1].strip().startswith("=") else 2
            current_block = Heading(line.strip(), level)
            i += 2
        else:
            current_block, i = _read_paragraph(lines, i)

    tokens.append(current_block)

    return tokens


def _plain_text(tokens):
    """Concatenate the text of *tokens*, descending into nested elements."""
    parts = []
    for token in tokens:
        if isinstance(token, str):
            parts.append(token)
            continue
        inner = getattr(token, "content", None)
        if isinstance(inner, list):
            parts.append(_plain_text(inner))
        elif isinstance(inner, str):
            parts.append(inner)
    return "".join(parts)


def make_html(ast):
    """Render a list of tokens (strings and Elements) with BeautifulSoup.

    Void elements (tag name ``m-void``) are skipped.  Returns the
    BeautifulSoup object holding the generated tags.

    Raises:
        ImportError: if the ``bs4`` package is not installed.
    """
    if beautifulsoup is None:
        raise ImportError("make_html() requires the 'bs4' (beautifulsoup4) package")

    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag = soup.new_tag(str(node.tag_name))
        if node.tag_name == "a":
            tag["href"] = node.destination
        if node.tag_name == "img":
            tag["src"] = node.destination
            # The alt text may contain inline elements, so flatten it rather
            # than joining the raw token list (which fails on non-strings).
            tag["alt"] = _plain_text(node.content)
        if node.tag_name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)
        try:
            if isinstance(node.content, list):
                tag.append(make_html(node.content))
            elif node.content and node.tag_name != "img":
                tag.string = node.content

            if node.tag_name == "img":
                tag.string = ""
        except AttributeError as exc:
            print(exc, file=sys.stderr)
        soup.append(tag)
    return soup


def markdown2html(markdown):
    """Tokenise *markdown* and render it with make_html()."""
    return make_html(tokenise(markdown))