By using this site, you agree to have cookies stored on your device, strictly for functional purposes, such as storing your session and preferences.

Dismiss

 markdown.py

View raw Download
text/x-script.python • 7.67 kiB
Python script, ASCII text executable
        
            
1
import re
2
import bs4 as beautifulsoup
3
import sys
4
5
# Pattern for the inline markdown constructs recognised by parse_line.
# It has to be compiled with re.VERBOSE: the literal spaces and the trailing
# "# ..." notes inside the string are layout only and are ignored by the engine.
#
# One alternative per construct, with these named groups:
#   em / textEm                          1-7 '*' or '_' delimiters and the text between them
#   textCode                             text between two backticks
#   imageFlag / urlText / urlDestination optional '!', the [text] and the (destination)
#   strike / textStrike                  '~~' delimiters and the struck text
#   diff / textDiff                      '--' or '++' delimiters and the changed text
#
# The emphasis text is matched lazily so that the span closes at the first
# matching delimiter; a greedy match would merge "_a_ and _b_" into a single
# span. A backslash-escaped '*' or '_' is consumed as a unit and never closes
# the span.
inlineRegex = r"""
(?P<em>[*_]{1,7}) (?P<textEm>(?:\\[*_]|[^*])*?) (?P=em) # emphasis
|
[`] (?P<textCode>(?:\\[`]|[^`])*) [`] # inline code
|
(?P<imageFlag>!?) \[ (?P<urlText>[^\[\]]*) \] \((?P<urlDestination>[^\(\)]*)\) # hyperlink or media
|
(?P<strike>~{2}) (?P<textStrike>(?:\\[~]|[^~])*) (~{2}) # strikethrough
|
(?P<diff>\-\-|\+\+) (?P<textDiff>(?:\\[-+]|[^-+])*) (?P=diff) # diffs
"""
16
17
18
def leading(string, character):
    """Count how many characters at the start of *string* belong to *character*.

    *character* is handed to ``str.lstrip`` unchanged, so a multi-character
    argument is treated as a set of characters to strip.
    """
    remainder = string.lstrip(character)
    return len(string) - len(remainder)
20
21
22
def trailing(string, character):
    """Count how many characters at the end of *string* belong to *character*.

    *character* is handed to ``str.rstrip`` unchanged, so a multi-character
    argument is treated as a set of characters to strip.
    """
    remainder = string.rstrip(character)
    return len(string) - len(remainder)
24
25
26
class Element:
    """Base node of the markdown tree; a bare instance stands for a void block."""

    def __init__(self):
        # No content by default; subclasses fill it in.
        self.content = None
        # Class names that make_html joins into the tag's "class" attribute.
        self.classes = []

    def __repr__(self):
        return "Void block"

    @property
    def tag_name(self):
        """Tag name for this node; make_html skips nodes named "m-void"."""
        return "m-void"
38
39
40
class Container(Element):
    """Element whose content is inline markdown, parsed when the node is built."""

    def __init__(self, content):
        super().__init__()
        # parse_line returns a list mixing plain strings and inline elements.
        parsed = parse_line(content)
        self.content = parsed

    def __repr__(self):
        return f"Generic container element: {self.content!r}"
47
48
49
class Heading(Container):
    """Heading block: inline-parsed text plus a numeric level."""

    def __init__(self, content, level):
        super().__init__(content)
        # tokenise passes the number of leading '#' characters here.
        self.level = level

    def __repr__(self):
        return f"Heading level {self.level}:\n\t{self.content!r}"

    @property
    def tag_name(self):
        """Tag name built from the level, e.g. "h2" for level 2."""
        return f"h{self.level}"
61
62
63
class Paragraph(Container):
    """Paragraph block holding inline-parsed text."""

    def __init__(self, content):
        # Container.__init__ already stores parse_line(content) as self.content.
        super().__init__(content)

    def __repr__(self):
        return f"Paragraph:\n\t{self.content!r}"

    @property
    def tag_name(self):
        return "p"
74
75
76
class Blockquote(Paragraph):
    """Quoted block whose content is itself a list of block-level elements."""

    def __init__(self, content):
        # Initialise the base classes with empty text, then replace the content
        # with the block-level parse of the quoted source.
        super().__init__("")
        nested_blocks = tokenise(content)
        self.content = nested_blocks

    def __repr__(self):
        return f"Blockquote:\n\t{self.content!r}"

    @property
    def tag_name(self):
        return "blockquote"
87
88
89
class Emphasis(Container):
    """Emphasised inline text.

    ``value`` is the emphasis strength; parse_line passes the number of
    delimiter characters. It is decomposed into up to three CSS classes:
    "emphasis-3" when value >= 4, "emphasis-2" when value % 4 >= 2 and
    "emphasis-1" when value is odd.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value
        # Order matters: classes are appended from strongest to weakest.
        class_flags = (
            (value >= 4, "emphasis-3"),
            (value % 4 >= 2, "emphasis-2"),
            (value % 2, "emphasis-1"),
        )
        for enabled, class_name in class_flags:
            if enabled:
                self.classes.append(class_name)

    def __repr__(self):
        return f"Emphasis ({self.value}): {self.content!r}"

    @property
    def tag_name(self):
        """"em" for a value of exactly 1, "strong" for anything else."""
        if self.value == 1:
            return "em"
        return "strong"
106
107
108
class Code(Element):
    """Inline code span; the text is stored verbatim, without inline parsing."""

    def __init__(self, content):
        super().__init__()
        # Wrapped in a one-item list so it is handled like other parsed content.
        self.content = [content]

    def __repr__(self):
        return "Inline code: " + str(self.content)

    @property
    def tag_name(self):
        return "code"
119
120
121
class Strikethrough(Container):
    """Struck-through inline text."""

    def __init__(self, content):
        # Nothing beyond the inline parsing done by Container.
        super().__init__(content)

    def __repr__(self):
        return "Strikethrough: " + repr(self.content)

    @property
    def tag_name(self):
        return "s"
131
132
133
class Diff(Container):
    """Inserted or deleted inline text.

    ``value`` is the delimiter that introduced the span: "++" renders as an
    insertion, any other value as a deletion.
    """

    def __init__(self, content, value):
        super().__init__(content)
        self.value = value

    def __repr__(self):
        return "Diff ({}): {}".format(self.value, self.content)

    @property
    def tag_name(self):
        if self.value == "++":
            return "ins"
        return "del"
144
145
146
class Link(Element):
    """Hyperlink with its text and destination; ``image`` marks media links."""

    def __init__(self, content, destination, image=False):
        super().__init__()
        self.content = content
        self.destination = destination
        self.image = image

    def __repr__(self):
        kind = "Image" if self.image else "Link"
        return f"{kind}: {self.content} -> {self.destination}"

    @property
    def tag_name(self):
        return "a"
159
160
161
class Image(Link):
    """Link flagged as an image; ``text`` is stored as the link content."""

    def __init__(self, text, destination):
        super().__init__(text, destination, image=True)

    @property
    def tag_name(self):
        return "img"
168
169
170
def parse_line(source):
    """Split a run of inline markdown into plain strings and inline elements.

    Args:
        source: The text to parse. If it ends with exactly one backslash,
            that backslash is replaced with a newline before parsing.

    Returns:
        A list alternating between the literal text found between matches
        (possibly empty strings) and Emphasis, Code, Strikethrough, Diff,
        Link or Image instances, in source order. The text after the last
        match is always appended as the final item.
    """
    if trailing(source, "\\") == 1:
        source = source.rstrip("\\") + "\n"

    tokens = []
    pattern = re.compile(inlineRegex, re.MULTILINE | re.DOTALL | re.VERBOSE)

    lookup = 0  # index just past the previous match
    for match in pattern.finditer(source):
        # Literal text between the previous match and this one.
        tokens.append(source[lookup:match.start()])
        lookup = match.end()

        # Exactly one alternative of the pattern matched. Groups belonging to
        # the other alternatives are None, while a matched group may be an
        # empty string (e.g. "![](src)"), so test against None rather than
        # truthiness to avoid silently dropping the construct.
        if match.group("em") is not None:
            tokens.append(Emphasis(match.group("textEm"), len(match.group("em"))))
        elif match.group("textCode") is not None:
            tokens.append(Code(match.group("textCode")))
        elif match.group("strike") is not None:
            tokens.append(Strikethrough(match.group("textStrike")))
        elif match.group("diff") is not None:
            tokens.append(Diff(match.group("textDiff"), match.group("diff")))
        elif match.group("urlText") is not None:
            if match.group("imageFlag"):
                tokens.append(Image(match.group("urlText"), match.group("urlDestination")))
            else:
                tokens.append(Link(match.group("urlText"), match.group("urlDestination")))

    tokens.append(source[lookup:])

    return tokens
204
205
206
def _is_heading_line(line):
    """Return True when *line* is a run of '#' followed by at least one space."""
    return line.startswith("#") and leading(line.lstrip("#"), " ") > 0


def tokenise(source):
    """Split markdown source into a list of block-level elements.

    Lines are classified in this order:
      * blank line                   -> void Element
      * '#' run followed by a space  -> Heading (level = number of leading '#')
      * consecutive lines with '>'   -> one Blockquote, parsed recursively
      * anything else                -> Paragraph, extended over following lines
                                        until a blank, heading or '>' line

    Args:
        source: The markdown text; lines are separated by "\n".

    Returns:
        A list of Element instances in source order. The list always starts
        with the initial void Element.
    """
    tokens = []

    current_block = Element()

    lines = source.split("\n")

    i = 0
    while i < len(lines):
        line = lines[i]

        # Every branch below starts a fresh block, so the block in progress is
        # always flushed first. Skipping the flush based on isinstance checks
        # would lose a Blockquote (a Paragraph subclass) that is directly
        # followed by paragraph text.
        tokens.append(current_block)

        if not line.strip():
            # Void block
            current_block = Element()
            i += 1
        elif _is_heading_line(line):
            content = line.lstrip("#").strip()
            current_block = Heading(content, leading(line, "#"))
            i += 1
        elif line.startswith(">"):
            quoted = []
            while i < len(lines) and lines[i].startswith(">"):
                quoted.append(lines[i].lstrip(">").strip() + "\n")
                i += 1
            current_block = Blockquote("".join(quoted))
        else:
            # Paragraph is default. The current line is known to be non-blank
            # and neither a heading nor a quote, so it is consumed
            # unconditionally; this guarantees progress for lines such as
            # "#tag" that start with '#' without being a heading.
            body = [line.strip() + "\n"]
            i += 1
            while (
                i < len(lines)
                and lines[i].strip()
                and not lines[i].startswith(">")
                and not _is_heading_line(lines[i])
            ):
                body.append(lines[i].strip() + "\n")
                i += 1
            current_block = Paragraph("".join(body))

    tokens.append(current_block)

    return tokens
258
259
260
def make_html(ast):
    """Render a list of strings and elements into a BeautifulSoup tree.

    Args:
        ast: An iterable of plain strings and Element-like nodes, as produced
            by tokenise or parse_line.

    Returns:
        A BeautifulSoup object holding the generated markup. Plain strings are
        appended as text; nodes whose tag_name is "m-void" are skipped.
    """
    # Name the tree builder explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = beautifulsoup.BeautifulSoup("", "html.parser")
    for node in ast:
        # Use bs4 to generate HTML
        if isinstance(node, str):
            soup.append(node)
            continue
        if not hasattr(node, "content") or node.tag_name == "m-void":
            continue

        tag = soup.new_tag(str(node.tag_name))
        if node.tag_name == "a":
            tag["href"] = node.destination
        if node.tag_name == "img":
            tag["src"] = node.destination
        if node.classes:
            tag["class"] = " ".join(node.classes)

        if node.tag_name == "img":
            # <img> is a void element and cannot have children, so the link
            # text becomes the alternative text attribute instead.
            tag["alt"] = node.content or ""
        else:
            try:
                if isinstance(node.content, list):
                    # Nested content is rendered recursively and attached whole.
                    tag.append(make_html(node.content))
                elif node.content:
                    tag.string = node.content
            except AttributeError as exc:
                # Best effort: report the problem and still emit the tag.
                print(exc, file=sys.stderr)
        soup.append(tag)
    return soup
285
286
287
if __name__ == '__main__':
    # Sample document exercising headings, emphasis, inline code, links and
    # nested blockquotes.
    sample = """
# Hello World!
## Title 1
### Part 1
#### Chapter _1_
##### Article 1
###### Section 1
Lorem **i`p`sum**
dolor `sit` amet

> Make it as simple as possible, [but not simpler](https://wikipedia.org).
> > If you can't explain it simply, you don't understand it well enough.

...
"""

    # Generate an AST from the markdown sample and dump every block
    ast = tokenise(sample)
    for node in ast:
        print(repr(node))

    # Now convert the AST to HTML
    html_formatter = beautifulsoup.formatter.HTMLFormatter(indent=4)
    print(make_html(ast).prettify(formatter=html_formatter))
311
312