By using this site, you agree to have cookies stored on your device, strictly for functional purposes, such as storing your session and preferences.

Dismiss

 markdown.py

View raw Download
text/x-script.python • 8.48 kiB
Python script, ASCII text executable
        
            
1
import re
2
import bs4 as beautifulsoup
3
import sys
4
5
6
def only_chars(string: str, chars: str) -> bool:
    """Return True when every character of ``string`` also occurs in ``chars``.

    An empty ``string`` trivially satisfies the condition.
    """
    return set(string) <= set(chars)
10
11
12
# Verbose-mode pattern matching one inline Markdown construct per alternative.
# It must be compiled with re.VERBOSE: the spaces and the trailing "# ..."
# notes inside the pattern are layout only and are ignored by the regex engine.
inlineRegex = (
    "\n"
    # Run of 1-7 '*' / '_' markers, the emphasised text, then the same run again.
    r"(?P<em>[*_]{1,7}) (?P<textEm>(?:\\[*]|[^*])*) (?P=em) # emphasis" "\n"
    "|\n"
    # Text between two single backticks.
    r"[`] (?P<textCode>(?:\\[`]|[^`])*) [`] # inline code" "\n"
    "|\n"
    # [text](destination), with an optional leading '!' flagging an image.
    r"(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media" "\n"
    "|\n"
    # Text between two '~~' markers.
    r"(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough" "\n"
    "|\n"
    # Text between matching '--' or '++' markers.
    r"(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs" "\n"
)
23
24
25
def leading(string: str, character: str) -> int:
    """Count how many characters ``str.lstrip(character)`` removes from the start of ``string``."""
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
27
28
29
def trailing(string: str, character: str) -> int:
    """Count how many characters ``str.rstrip(character)`` removes from the end of ``string``."""
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
31
32
33
class Element:
    """Base node of the syntax tree; used directly it stands for a void block."""

    def __init__(self):
        # Names later joined into the HTML "class" attribute.
        self.classes = []
        # Payload of the node; subclasses replace it with text or child nodes.
        self.content = None

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        """Tag name for this node; "m-void" marks nodes that produce no HTML."""
        return "m-void"
45
46
47
class Container(Element):
    """Element whose content is the inline-parsed form of a source string."""

    def __init__(self, content):
        super().__init__()
        # Replace the raw text with the token list produced by parse_line.
        self.content = parse_line(content)

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
54
55
56
class Rule(Element):
    """Horizontal rule; carries no content of its own."""

    # Construction is inherited unchanged from Element.

    def __repr__(self):
        return "Rule"

    @property
    def tag_name(self):
        """Tag name for this node."""
        return "hr"
66
67
68
class HardBreak(Element):
    """Forced line break; carries no content of its own."""

    # Construction is inherited unchanged from Element.

    def __repr__(self):
        return "Hard break"

    @property
    def tag_name(self):
        """Tag name for this node."""
        return "br"
78
79
80
class Heading(Container):
    """Heading whose text is inline-parsed and whose depth is ``level``."""

    def __init__(self, content, level):
        super().__init__(content)
        # Stored as given; it becomes the numeric suffix of the tag name.
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name built from the level, e.g. "h2" for level 2."""
        return f"h{self.level}"
92
93
94
class Paragraph(Container):
    """Paragraph of text whose content is the inline-parsed source."""

    def __init__(self, content):
        # Container.__init__ already stores parse_line(content) in
        # self.content, so delegate once instead of first parsing an empty
        # string and then overwriting the result.
        super().__init__(content)

    def __repr__(self):
        return "Paragraph:\n\t" + repr(self.content)

    @property
    def tag_name(self):
        """Tag name for this node."""
        return "p"
105
106
107
class Blockquote(Paragraph):
    """Quoted section whose content is a list of block-level tokens."""

    def __init__(self, content):
        # Initialise the base with empty text, then replace the content with
        # the block-level tokenisation of the quoted source.
        super().__init__("")
        self.content = tokenise(content)

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name for this node."""
        return "blockquote"
118
119
120
class Emphasis(Container):
    """Emphasised inline text; ``value`` selects the CSS classes and the tag."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        # Each test contributes one class, checked in the order listed.
        class_tests = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2, "emphasis-1"),
        )
        self.classes.extend(name for active, name in class_tests if active)

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"

    @property
    def tag_name(self):
        """Return "em" when the value is exactly 1 and "strong" otherwise."""
        if self.value == 1:
            return "em"
        return "strong"
137
138
139
class Code(Element):
    """Inline code span; the text is kept verbatim and not inline-parsed."""

    def __init__(self, content):
        super().__init__()
        # Stored as a one-item list rather than as a bare string.
        self.content = [content]

    def __repr__(self):
        return "Inline code: {}".format(self.content)

    @property
    def tag_name(self):
        """Tag name for this node."""
        return "code"
150
151
152
class Strikethrough(Container):
    """Struck-through inline text; the content is inline-parsed by Container."""

    # Construction is inherited unchanged from Container.

    def __repr__(self):
        return "Strikethrough: " + repr(self.content)

    @property
    def tag_name(self):
        """Tag name for this node."""
        return "s"
162
163
164
class Diff(Container):
    """Inserted or deleted inline text; ``value`` is the marker that delimited it."""

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return "Diff ({}): {}".format(self.value, self.content)

    @property
    def tag_name(self):
        """Return "ins" for the "++" marker and "del" for anything else."""
        if self.value == "++":
            return "ins"
        return "del"
175
176
177
class Link(Element):
    """Hyperlink with a text label and a destination."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.image = image
        self.destination = destination
        # The label is stored as given; it is not inline-parsed here.
        self.content = content

    def __repr__(self):
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        """Tag name for this node."""
        return "a"
190
191
192
class Image(Link):
    """Link variant that is always flagged as an image."""

    def __init__(self, text, destination):
        # Pass True as Link's third positional argument (its image flag).
        super().__init__(text, destination, True)

    @property
    def tag_name(self):
        """Tag name for this node."""
        return "img"
199
200
201
def parse_line(source):
    """Split inline Markdown into plain strings and inline elements.

    Args:
        source: Text to scan for the constructs described by ``inlineRegex``.

    Returns:
        A list alternating between the literal text found between matches
        (possibly empty strings) and the element built for each match:
        ``Emphasis``, ``Code``, ``Strikethrough``, ``Diff``, ``Link`` or
        ``Image``. The text after the last match is always appended, and a
        ``HardBreak`` follows it when ``source`` ended in exactly one
        backslash.
    """
    # Exactly one trailing backslash requests a hard break; the backslash is
    # removed from the text and the break is appended after all other tokens.
    hard_break = trailing(source, "\\") == 1
    if hard_break:
        source = source.rstrip("\\")

    pattern = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    tokens = []
    cursor = 0
    for match in pattern.finditer(source):
        # Literal text between the previous match and this one.
        tokens.append(source[cursor:match.start()])
        cursor = match.end()

        # Only the groups of the alternative that matched are not None.
        # Testing against None rather than truthiness keeps constructs whose
        # text is empty (an empty code span, "[](url)", "![](image.png)");
        # a truthiness test would drop them from the output entirely.
        if match.group("em") is not None:
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("textCode") is not None:
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike") is not None:
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff") is not None:
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            text = match.group("urlText")
            destination = match.group("urlDestination")
            # A non-empty imageFlag ("!") turns the link into an image.
            if match.group("imageFlag"):
                tokens.append(Image(text, destination))
            else:
                tokens.append(Link(text, destination))

    # Remainder after the last match (the whole source when nothing matched).
    tokens.append(source[cursor:])

    if hard_break:
        tokens.append(HardBreak())

    return tokens
240
241
242
def tokenise(source):
    """Split Markdown source into a list of block-level elements.

    Lines are classified in this order: blank line (void block), horizontal
    rule, heading, blockquote, and paragraph as the fallback.

    Args:
        source: Markdown text; lines are separated by "\n".

    Returns:
        A list of ``Element`` instances in document order. The list starts
        with the initial void ``Element`` and contains one further void
        ``Element`` for each blank line.
    """
    tokens = []

    # The block being built; it is flushed to ``tokens`` whenever the next
    # block starts, and once more after the last line.
    current_block = Element()

    lines = source.split("\n")

    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Every branch starts a new block, so the previous one is always
        # flushed here. Flushing only when the previous block was "not a
        # Paragraph" would lose a Blockquote (a Paragraph subclass) that is
        # directly followed by paragraph text.
        tokens.append(current_block)

        if not stripped:
            # Void block
            current_block = Element()
            i += 1
        elif only_chars(stripped, "-_* ") and len(stripped) >= 3:
            # Horizontal rule
            current_block = Rule()
            i += 1
        elif line.startswith("#") and leading(line.lstrip("#"), " "):
            # Heading: leading '#' run followed by at least one space.
            content = line.lstrip("#").strip()
            current_block = Heading(content, leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            # Blockquote: gather all consecutive '>' lines, dropping the
            # leading '>' run and surrounding whitespace from each.
            quoted = []
            while i < len(lines) and lines[i].startswith(">"):
                quoted.append(lines[i].lstrip(">").strip() + "\n")
                i += 1
            current_block = Blockquote("".join(quoted))
        else:
            # Paragraph is default. The current line is always consumed,
            # even when it starts with '#' without being a heading (for
            # example "#tag"); otherwise ``i`` would never advance and the
            # outer loop would never terminate.
            paragraph = [stripped + "\n"]
            i += 1
            while (
                i < len(lines)
                and lines[i].strip()
                and not lines[i].startswith(("#", ">"))
            ):
                paragraph.append(lines[i].strip() + "\n")
                i += 1
            current_block = Paragraph("".join(paragraph))

    tokens.append(current_block)

    return tokens
301
302
303
def make_html(ast):
    """Render a list of tokens as a BeautifulSoup tree.

    Args:
        ast: Iterable of plain strings and element objects, as produced by
            ``tokenise`` or ``parse_line``.

    Returns:
        A ``BeautifulSoup`` object holding the generated nodes. Strings are
        appended as text; elements become tags named after ``tag_name``.
        Objects without a ``content`` attribute, and elements whose tag name
        is "m-void", are skipped.
    """
    soup = beautifulsoup.BeautifulSoup()
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag = soup.new_tag(str(node.tag_name))
        is_image = node.tag_name == "img"
        if node.tag_name == "a":
            tag["href"] = node.destination
        if is_image:
            tag["src"] = node.destination
            # <img> is a void element and cannot have children, so the
            # label belongs in the alt attribute rather than inside the tag.
            tag["alt"] = node.content or ""
        if node.classes:
            tag["class"] = " ".join(node.classes)

        if not is_image:
            try:
                if isinstance(node.content, list):
                    # Child tokens are rendered recursively.
                    tag.append(make_html(node.content))
                elif node.content:
                    tag.string = node.content
            except AttributeError as exc:
                # Best effort: report the problem on stderr and still emit
                # the tag, possibly without its content.
                print(exc, file=sys.stderr)
        soup.append(tag)
    return soup
328
329
330
if __name__ == '__main__':
    # Demo: tokenise a small sample document, dump the resulting blocks,
    # then print the same tree rendered as indented HTML.
    sample = r"""
# Hello World!
## Title 1
### Part 1
#### Chapter _1_
##### Article 1
###### Section 1
Lorem **i`p`sum**
dolor `sit` amet

consectetur \
*adipiscing* elit

* * *

> Make it as simple as possible, [but not simpler](https://wikipedia.org).
> > If you can't explain it simply, you don't understand it well enough.

...
"""
    # Generate an AST from the markdown sample
    document = tokenise(sample)
    for block in document:
        print(repr(block))

    # Now convert the AST to HTML
    html_formatter = beautifulsoup.formatter.HTMLFormatter(indent=4)
    print(make_html(document).prettify(formatter=html_formatter))
359
360