By using this site, you agree to have cookies stored on your device, strictly for functional purposes, such as storing your session and preferences.

Dismiss

 markdown.py

View raw Download
text/x-script.python • 11.66 kiB
Python script, ASCII text executable
        
            
1
import re
2
import bs4 as beautifulsoup
3
import sys
4
5
6
def only_chars(string, chars):
    """Return True when every character of *string* also occurs in *chars*.

    An empty *string* trivially satisfies the condition.
    """
    return set(string) <= set(chars)
10
11
12
inlineRegex = r"""
13
(?P<em>[*_]{1,7}) (?P<textEm>(?:\\[*]|[^*])*) (?P=em) # emphasis
14
|
15
[`] (?P<textCode>(?:\\[`]|[^`])*) [`] # inline code
16
|
17
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
18
|
19
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
20
|
21
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
22
"""
23
24
25
def leading(string, character):
    """Count how many characters at the start of *string* belong to *character*.

    *character* is handed to ``str.lstrip``, so when it holds several
    characters any of them is counted.
    """
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
27
28
29
def trailing(string, character):
    """Count how many characters at the end of *string* belong to *character*.

    *character* is handed to ``str.rstrip``, so when it holds several
    characters any of them is counted.
    """
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
31
32
33
class Element:
    """Base node of the document tree.

    A bare ``Element`` stands for a void block: it carries no content and
    reports the placeholder tag name ``m-void``.
    """

    def __init__(self):
        # CSS class names attached to the node.
        self.classes = []
        # Payload of the node; subclasses overwrite it.
        self.content = None

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "m-void"

    def __repr__(self):
        return "Void block"
45
46
47
class Container(Element):
    """Element whose content is a line of text parsed for inline markup.

    ``content`` holds the token list returned by ``parse_line``. The class
    does not override ``tag_name``, so it keeps the one from ``Element``.
    """

    def __init__(self, content):
        super().__init__()
        inline_tokens = parse_line(content)
        self.content = inline_tokens

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
54
55
56
class Rule(Element):
    """Horizontal rule; has no content of its own."""

    # Construction is inherited unchanged from Element.

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "hr"

    def __repr__(self):
        return "Rule"
66
67
68
class HardBreak(Element):
    """Forced line break; has no content of its own."""

    # Construction is inherited unchanged from Element.

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "br"

    def __repr__(self):
        return "Hard break"
78
79
80
class Heading(Container):
    """Heading whose text is parsed for inline markup.

    ``level`` is stored as given and appended to ``h`` to form the tag name.
    """

    def __init__(self, content, level):
        super().__init__(content)
        self.level = level

    @property
    def tag_name(self):
        """Tag name used for this node: ``h`` followed by the level."""
        return f"h{self.level}"

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"
92
93
94
class Paragraph(Container):
    """Paragraph of text parsed for inline markup.

    ``content`` holds the token list produced by ``parse_line`` for the
    paragraph text.
    """

    def __init__(self, content):
        # Container.__init__ already stores parse_line(content); delegating to
        # it directly avoids first parsing a throwaway empty string and then
        # parsing the real text a second time.
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "p"
105
106
107
class CodeBlock(Element):
    """Fenced block of code, stored verbatim together with its language label."""

    def __init__(self, content, language="text"):
        super().__init__()
        # Label first, then the raw text; neither is parsed any further.
        self.language = language
        self.content = content

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "pre"

    def __repr__(self):
        return f"Code block ({self.language}):\n\t{self.content!r}"
119
120
121
class UnorderedList(Element):
    """Bulleted list; ``content`` is stored exactly as passed in."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "ul"

    def __repr__(self):
        return f"Unordered list:\n\t{self.content!r}"
132
133
134
class OrderedList(Element):
    """Numbered list; ``content`` is stored exactly as passed in."""

    def __init__(self, content):
        super().__init__()
        self.content = content

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "ol"

    def __repr__(self):
        return f"Ordered list:\n\t{self.content!r}"
145
146
147
class ListItem(Paragraph):
    """Single list entry whose text is itself tokenised into blocks."""

    def __init__(self, content):
        # Initialise the parent with empty text, then replace the inline
        # tokens with the block-level tokens of the item's own text.
        super().__init__("")
        block_tokens = tokenise(content)
        self.content = block_tokens

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "li"

    def __repr__(self):
        return f"List item:\n\t{self.content!r}"
158
159
160
class Blockquote(Paragraph):
    """Quoted section whose text is itself tokenised into blocks."""

    def __init__(self, content):
        # Initialise the parent with empty text, then replace the inline
        # tokens with the block-level tokens of the quoted text.
        super().__init__("")
        block_tokens = tokenise(content)
        self.content = block_tokens

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "blockquote"

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"
171
172
173
class Emphasis(Container):
    """Emphasised inline text.

    ``value`` is the strength passed by the caller (``parse_line`` passes the
    length of the delimiter run). It selects which ``emphasis-N`` classes are
    attached and whether the tag is ``em`` or ``strong``.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

        # Each condition independently switches one class on; the order of
        # the table is the order in which the classes are appended.
        class_switches = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2, "emphasis-1"),
        )
        self.classes.extend(name for enabled, name in class_switches if enabled)

    @property
    def tag_name(self):
        """Tag name used for this node: ``em`` for value 1, otherwise ``strong``."""
        if self.value == 1:
            return "em"
        return "strong"

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"
190
191
192
class Code(Element):
    """Inline code span; the raw text is kept as the only item of a list."""

    def __init__(self, content):
        super().__init__()
        # Wrapped in a list rather than parsed, so the text stays verbatim.
        self.content = [content]

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "code"

    def __repr__(self):
        return "Inline code: " + str(self.content)
203
204
205
class Strikethrough(Container):
    """Struck-through inline text, parsed for nested inline markup."""

    # Construction is inherited unchanged from Container.

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "s"

    def __repr__(self):
        return "Strikethrough: " + repr(self.content)
215
216
217
class Diff(Container):
    """Inserted or deleted inline text.

    ``value`` is the delimiter that introduced the span; ``"++"`` maps to
    ``ins`` and anything else to ``del``.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    @property
    def tag_name(self):
        """Tag name used for this node."""
        if self.value == "++":
            return "ins"
        return "del"

    def __repr__(self):
        return "Diff ({}): {}".format(self.value, self.content)
228
229
230
class Link(Element):
    """Hyperlink with its text and destination stored as given.

    ``image`` records whether the node was created as an image.
    """

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.image = image
        self.destination = destination
        self.content = content

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "a"

    def __repr__(self):
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"
243
244
245
class Image(Link):
    """Link variant that is flagged as an image and uses the ``img`` tag."""

    def __init__(self, text, destination):
        super().__init__(text, destination, image=True)

    @property
    def tag_name(self):
        """Tag name used for this node."""
        return "img"
252
253
254
def parse_line(source):
    """Split *source* into plain strings and inline elements.

    The text is scanned with ``inlineRegex``. The text between matches is
    kept as plain ``str`` tokens (possibly empty), and each match becomes an
    ``Emphasis``, ``Code``, ``Strikethrough``, ``Diff``, ``Link`` or ``Image``.

    If *source* ends in exactly one backslash, that backslash is removed and
    a ``HardBreak`` is appended as the final token. Only a backslash that is
    the very last character counts.

    Returns:
        list: the tokens in source order.
    """
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    pattern = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    tokens = []
    lookup = 0  # index just past the previous match
    for match in pattern.finditer(source):
        # Plain text between the previous match and this one.
        tokens.append(source[lookup:match.start()])
        lookup = match.end()

        # Exactly one alternative of the pattern matched. Groups whose text
        # may legitimately be empty (code, link text) are tested against None
        # rather than for truthiness, so that e.g. "![](pic.png)" is not
        # silently dropped from the output.
        if match.group("em"):
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("textCode") is not None:
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike"):
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff"):
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlDestination") is not None:
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))

    # Whatever follows the last match (or the whole text if nothing matched).
    tokens.append(source[lookup:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens
293
294
295
# Prefixes that open an unordered list item.
_LIST_MARKERS = ("* ", "+ ", "- ")

# Prefixes that stop a paragraph from absorbing the following line.
_PARAGRAPH_BREAKERS = ("#", ">", "* ", "+ ", "- ", "~~~", "```")


def _is_rule(line):
    """Return True if *line* is a horizontal rule: 3+ characters, all from ``-_* ``."""
    stripped = line.strip()
    return len(stripped) >= 3 and only_chars(stripped, "-_* ")


def tokenise(source):
    """Split Markdown *source* into a flat list of block-level elements.

    One leading space is removed from every line before classification.
    Lines are then grouped into horizontal rules, unordered lists, headings,
    blockquotes, fenced code blocks and paragraphs. Blank lines produce void
    ``Element`` placeholders, and the list always starts with one, because
    the initial "current block" is a void element.

    Returns:
        list: block elements in document order.
    """
    tokens = []
    current_block = Element()

    # Remove a single leading space so that text taken from inside a
    # blockquote ("> text" -> " text") is classified like top-level text.
    lines = [line[1:] if line.startswith(" ") else line for line in source.split("\n")]

    i = 0
    while i < len(lines):
        line = lines[i]

        # Every branch replaces current_block, so the previous block is
        # always flushed first; skipping the flush would lose it.
        tokens.append(current_block)

        if not line.strip():
            # Void block
            current_block = Element()
            i += 1
        elif _is_rule(line):
            # Horizontal rule
            current_block = Rule()
            i += 1
        elif line.startswith(_LIST_MARKERS):
            items = []
            while i < len(lines) and lines[i].startswith(_LIST_MARKERS):
                inner_content = lines[i][2:].strip() + "\n"
                i += 1
                # Lines that still start with a space continue the item.
                while i < len(lines) and lines[i].startswith(" "):
                    inner_content += lines[i][2:] + "\n"
                    i += 1
                items.append(ListItem(inner_content))
            current_block = UnorderedList(items)
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # Heading: a run of "#" followed by at least one space.
            current_block = Heading(line.lstrip("#").strip(), leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            quoted = []
            # A quote runs over ">" lines plus continuation lines that are
            # non-blank and are neither a heading nor a horizontal rule. The
            # test must look at lines[i], not at the first line of the quote.
            while i < len(lines) and (
                lines[i].startswith(">")
                or (
                    lines[i].strip()
                    and not lines[i].startswith("#")
                    and not _is_rule(lines[i])
                )
            ):
                quoted.append(lines[i].lstrip(">") + "\n")
                i += 1
            current_block = Blockquote("".join(quoted))
        elif leading(line, "~") == 3 or leading(line, "`") == 3:
            language = line.lstrip("`~").strip()

            body = []
            i += 1  # skip the opening fence
            while i < len(lines) and lines[i].strip() not in ("```", "~~~"):
                body.append(lines[i] + "\n")
                i += 1

            if i < len(lines):
                i += 1  # prevent a new block from beginning with the closing fence

            current_block = CodeBlock("".join(body), language=language)
        else:
            # Paragraph. The current line matched no other block type, so it
            # is consumed unconditionally: lines such as "#tag" or "~~~~"
            # start with a breaker prefix yet open no other block, and would
            # otherwise never advance the cursor.
            text = [lines[i].strip() + "\n"]
            i += 1
            while (
                i < len(lines)
                and lines[i].strip()
                and not lines[i].startswith(_PARAGRAPH_BREAKERS)
            ):
                text.append(lines[i].strip() + "\n")
                i += 1
            current_block = Paragraph("".join(text))

    tokens.append(current_block)

    return tokens
393
394
395
def make_html(ast):
    """Render a list of tokens into a BeautifulSoup tree.

    Plain strings are appended as text. Objects with a ``content`` attribute
    become a tag named after their ``tag_name``, except nodes reporting
    ``m-void``, which are skipped, as is anything else. List content is
    rendered recursively; string content becomes the tag's text.

    Args:
        ast: iterable of ``str`` and element objects, as produced by
            ``tokenise`` or ``parse_line``.

    Returns:
        The ``BeautifulSoup`` object holding the generated markup.
    """
    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        name = node.tag_name
        tag = soup.new_tag(str(name))
        if name == "a":
            tag["href"] = node.destination
        if name == "img":
            tag["src"] = node.destination
            # <img> cannot have children; its text belongs in the alt
            # attribute rather than inside the tag.
            tag["alt"] = node.content or ""
        if name == "pre":
            tag["data-language"] = node.language
        if node.classes:
            tag["class"] = " ".join(node.classes)

        if name != "img":
            try:
                if isinstance(node.content, list):
                    tag.append(make_html(node.content))
                elif node.content:
                    tag.string = node.content
            except AttributeError as exc:
                # Best effort: report the problem and still emit the tag.
                print(exc, file=sys.stderr)

        soup.append(tag)
    return soup
422
423
424
# Demo entry point: tokenises a hard-coded Markdown sample and prints the
# generated HTML to standard output.
if __name__ == '__main__':
    # Generate an AST from a markdown file
    # The sample exercises blockquotes (with a list, continuation lines and a
    # fenced code block inside), a horizontal rule and an asterisk list.
    ast = tokenise(
        r"""
> This is a blockquote
> that spans multiple lines
> and contains a list:
>
> - Item 1
> - Item 2
> - Item 3
> - Subitem 1
> - Subitem 2
>
> And some more text
>
Does it also support that bullet?
Or continuation lines?
> What about a code block?
> ```python
> print("Hello, world!")
> ```

* * *

* Alternatively, this is a list
* That uses asterisks
"""
    )
    # for i in ast:
    # print(repr(i))

    # Now convert the AST to HTML
    # prettify() is given a formatter configured with indent=4.
    print(make_html(ast).prettify(formatter=beautifulsoup.formatter.HTMLFormatter(indent=4)))
458
459