markdown-viewer/markdown_parser.py at main · reichenbach/markdown-viewer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Lightweight Markdown parser for Python 2.5 / PowerPC Mac.
Converts Markdown text into a list of tagged segments for Tkinter Text widget.

Each segment is a tuple: (text, [tag1, tag2, ...])
"""

import re


class MarkdownParser(object):
    """Parse a subset of Markdown into tagged segments for display.

    The parser is line oriented.  ``parse`` classifies each input line as a
    block element (fenced code, heading, rule, list item, blockquote or
    plain paragraph line) and ``_parse_inline`` splits the line's text into
    runs that carry inline tags (bold, italic, code, strikethrough, links,
    images).

    Every segment is a ``(text, tags)`` tuple where ``tags`` is a list of
    tag-name strings.
    """

    # Block-level patterns, compiled once instead of being handed to
    # re.match() as strings for every input line.
    _HEADING_RE = re.compile(r'^(#{1,6})\s+(.+?)(?:\s*#*\s*)?$')
    _HR_RE = re.compile(r'^(\*{3,}|-{3,}|_{3,})\s*$')
    _ULIST_RE = re.compile(r'^(\s*)[*\-+]\s+(.+)$')
    _OLIST_RE = re.compile(r'^(\s*)(\d+)[.)]\s+(.+)$')
    _BLOCKQUOTE_RE = re.compile(r'^>\s?(.*)')

    def __init__(self):
        """Compile the inline, link and image patterns used by the parser."""
        # Inline patterns.  Order matters: when two patterns match at the
        # same position the one listed first wins, so the longer delimiters
        # (bold+italic, then bold) come before italic.
        #
        # The underscore variants additionally require that the opening
        # delimiter is not preceded by a word character and the closing one
        # is not followed by a word character, so identifiers such as
        # snake_case_name are left untouched instead of being italicised.
        self.inline_patterns = [
            # Bold + Italic
            (re.compile(r'\*\*\*(.+?)\*\*\*'), ['bold', 'italic']),
            (re.compile(r'(?<!\w)___(.+?)___(?!\w)'), ['bold', 'italic']),
            # Bold
            (re.compile(r'\*\*(.+?)\*\*'), ['bold']),
            (re.compile(r'(?<!\w)__(.+?)__(?!\w)'), ['bold']),
            # Italic: a single delimiter that is not part of a doubled one
            (re.compile(r'(?<!\*)\*(?!\*)(.+?)(?<!\*)\*(?!\*)'), ['italic']),
            (re.compile(r'(?<!\w)_(?!_)(.+?)(?<!_)_(?!\w)'), ['italic']),
            # Inline code
            (re.compile(r'`(.+?)`'), ['code_inline']),
            # Strikethrough
            (re.compile(r'~~(.+?)~~'), ['strikethrough']),
        ]

        # Link pattern: [text](url)
        self.link_pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
        # Image pattern: ![alt](path)
        self.image_pattern = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')

    @staticmethod
    def _is_underline(candidate, char):
        """Return True if candidate is two or more repetitions of char."""
        return len(candidate) >= 2 and candidate == char * len(candidate)

    def parse(self, text):
        """Parse markdown text and return a list of (text, tags) segments.

        Lines are examined in this order: fenced code fences, code-block
        content, blank lines, ATX headings, Setext headings, horizontal
        rules, unordered list items, ordered list items, blockquotes and
        finally plain paragraph text.  Heading and code-block text is
        emitted verbatim; list, blockquote and paragraph text goes through
        inline parsing.

        text -- the Markdown source; LF, CRLF and bare CR line endings are
                all accepted.

        Returns a new list of (text, [tag, ...]) tuples.
        """
        # Normalise line endings first so that a stray '\r' never ends up
        # inside a segment when the input uses CRLF or CR line endings.
        lines = text.replace('\r\n', '\n').replace('\r', '\n').split('\n')
        segments = []
        in_code_block = False
        code_block_lines = []

        i = 0
        total = len(lines)
        while i < total:
            line = lines[i]
            stripped = line.strip()

            # Fenced code blocks: a fence line toggles code mode and is
            # itself never emitted.
            if stripped.startswith('```'):
                if in_code_block:
                    code_text = '\n'.join(code_block_lines)
                    if code_text:
                        segments.append((code_text + '\n', ['code_block']))
                    code_block_lines = []
                    in_code_block = False
                else:
                    in_code_block = True
                i += 1
                continue

            if in_code_block:
                code_block_lines.append(line)
                i += 1
                continue

            # Blank line
            if stripped == '':
                segments.append(('\n', ['normal']))
                i += 1
                continue

            # Headings (ATX style); trailing '#' characters are dropped.
            heading_match = self._HEADING_RE.match(line)
            if heading_match:
                level = len(heading_match.group(1))
                segments.append((heading_match.group(2) + '\n',
                                 ['h%d' % level]))
                i += 1
                continue

            # Setext-style headings: the underline on the following line
            # is consumed together with the heading text.
            if i + 1 < total:
                next_line = lines[i + 1].strip()
                if self._is_underline(next_line, '='):
                    segments.append((line + '\n', ['h1']))
                    i += 2
                    continue
                if self._is_underline(next_line, '-'):
                    segments.append((line + '\n', ['h2']))
                    i += 2
                    continue

            # Horizontal rule, rendered as a fixed-width row of dashes.
            if self._HR_RE.match(stripped):
                segments.append(('-' * 40 + '\n', ['hr']))
                i += 1
                continue

            # Unordered list items; two columns of indent are one level.
            list_match = self._ULIST_RE.match(line)
            if list_match:
                bullet_level = len(list_match.group(1)) // 2
                prefix = '  ' * bullet_level + '* '
                segments.append((prefix, ['list_bullet']))
                self._parse_inline(list_match.group(2) + '\n', segments,
                                   ['list_item'])
                i += 1
                continue

            # Ordered list items; the number is kept, the separator is
            # always rendered as '.'.
            olist_match = self._OLIST_RE.match(line)
            if olist_match:
                bullet_level = len(olist_match.group(1)) // 2
                prefix = ('  ' * bullet_level
                          + olist_match.group(2) + '. ')
                segments.append((prefix, ['list_bullet']))
                self._parse_inline(olist_match.group(3) + '\n', segments,
                                   ['list_item'])
                i += 1
                continue

            # Blockquote
            bq_match = self._BLOCKQUOTE_RE.match(line)
            if bq_match:
                segments.append(('  | ', ['blockquote_bar']))
                self._parse_inline(bq_match.group(1) + '\n', segments,
                                   ['blockquote'])
                i += 1
                continue

            # Normal paragraph
            self._parse_inline(line + '\n', segments, ['normal'])
            i += 1

        # Handle unclosed code block: emit whatever was collected.
        if in_code_block and code_block_lines:
            code_text = '\n'.join(code_block_lines)
            segments.append((code_text + '\n', ['code_block']))

        return segments

    def _parse_inline(self, text, segments, base_tags):
        """Split text into inline-formatted runs appended to segments.

        Repeatedly finds the construct (image, link or inline format) that
        starts earliest in the remaining text, emits the plain text in
        front of it, emits the construct, and continues after it.

        text      -- the text to scan.
        segments  -- list that receives the (text, tags) tuples; it is
                     modified in place.
        base_tags -- tags applied to every emitted run; it is copied for
                     each segment and never modified.

        Returns None.
        """
        while text:
            earliest_match = None
            earliest_start = len(text)
            earliest_pattern_tags = None
            earliest_type = 'format'  # 'format', 'link', or 'image'

            # Check images first (before links, since ![...] starts with !)
            img_m = self.image_pattern.search(text)
            if img_m and img_m.start() < earliest_start:
                earliest_match = img_m
                earliest_start = img_m.start()
                earliest_type = 'image'

            # Check links, ignoring the [..](..) tail of an image
            link_m = self.link_pattern.search(text)
            if link_m and link_m.start() < earliest_start:
                if link_m.start() == 0 or text[link_m.start() - 1] != '!':
                    earliest_match = link_m
                    earliest_start = link_m.start()
                    earliest_type = 'link'

            # Check inline formatting patterns; the strict '<' keeps the
            # first-listed pattern when several start at the same offset.
            for pattern, tags in self.inline_patterns:
                m = pattern.search(text)
                if m and m.start() < earliest_start:
                    earliest_match = m
                    earliest_start = m.start()
                    earliest_pattern_tags = tags
                    earliest_type = 'format'

            if earliest_match is None:
                # Nothing left to format: the remainder is plain text.
                segments.append((text, list(base_tags)))
                break

            # Text before the match
            if earliest_start > 0:
                segments.append((text[:earliest_start], list(base_tags)))

            if earliest_type == 'image':
                # Images are shown as "[img: alt -> path]" placeholder text.
                alt_text = earliest_match.group(1) or 'image'
                img_path = earliest_match.group(2)
                segments.append(('[', list(base_tags)))
                segments.append(('img', list(base_tags) + ['image_icon']))
                segments.append((': ', list(base_tags)))
                segments.append((alt_text, list(base_tags) + ['bold']))
                # Unicode literal: in a plain Python 2 string '\u2192' is
                # not an escape and would be shown as six literal characters.
                segments.append((u' \u2192 ', list(base_tags)))
                segments.append((img_path, list(base_tags) + ['link_url']))
                segments.append((']', list(base_tags)))
            elif earliest_type == 'link':
                # Links are shown as "text (url)".
                link_text = earliest_match.group(1)
                link_url = earliest_match.group(2)
                segments.append((link_text, list(base_tags) + ['link_text']))
                segments.append((' (', list(base_tags)))
                segments.append((link_url, list(base_tags) + ['link_url']))
                segments.append((')', list(base_tags)))
            else:
                inner_text = earliest_match.group(1)
                combined_tags = list(base_tags) + list(earliest_pattern_tags)
                if 'code_inline' in earliest_pattern_tags:
                    # Inline code is literal: no nested formatting.
                    segments.append((inner_text, combined_tags))
                else:
                    # Recurse so nested formatting accumulates tags.
                    self._parse_inline(inner_text, segments, combined_tags)

            text = text[earliest_match.end():]


if __name__ == '__main__':
    # Smoke test: parse a small sample document and dump every segment.
    test = """# Hello World

This is **bold** and *italic* and `code`.

## Lists

- Item one
- Item **two**
- Item three

> This is a blockquote

```
def hello():
    print "world"
```

Normal paragraph with ***bold italic*** text.
"""
    parser = MarkdownParser()
    result = parser.parse(test)
    for seg in result:
        # Parenthesised single argument: prints the same text whether
        # `print` is a statement (Python 2) or a function (Python 3).
        print(repr(seg))