Statistics
| Revision:

root / libsylph / html.c @ 3252

History | View | Annotate | Download (16.5 KB)

1
/*
2
 * LibSylph -- E-Mail client library
3
 * Copyright (C) 1999-2011 Hiroyuki Yamamoto
4
 *
5
 * This library is free software; you can redistribute it and/or
6
 * modify it under the terms of the GNU Lesser General Public
7
 * License as published by the Free Software Foundation; either
8
 * version 2.1 of the License, or (at your option) any later version.
9
 *
10
 * This library is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
 * Lesser General Public License for more details.
14
 *
15
 * You should have received a copy of the GNU Lesser General Public
16
 * License along with this library; if not, write to the Free Software
17
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18
 */
19

    
20
#include <glib.h>
21
#include <stdio.h>
22
#include <string.h>
23
#include <ctype.h>
24

    
25
#include "html.h"
26
#include "codeconv.h"
27
#include "utils.h"
28

    
29
/* Size in bytes of the stack buffers used for one input line
 * (html_read_line) and for the text of one tag (html_parse_tag). */
#define HTMLBUFSIZE        8192

/* Text emitted in place of an <hr> element (a newline is added by the
 * caller). */
#define HR_STR                "------------------------------------------------"
31

    
32
typedef struct _HTMLSymbol        HTMLSymbol;
33

    
34
struct _HTMLSymbol
35
{
36
        gchar *const key;
37
        gchar *const val;
38
};
39

    
40
static HTMLSymbol symbol_list[] = {
41
        {"&lt;"    , "<"},
42
        {"&gt;"    , ">"},
43
        {"&amp;"   , "&"},
44
        {"&quot;"  , "\""}
45
};
46

    
47
/* &#160; - &#255; */
48
static HTMLSymbol latin_symbol_list[] = {
49
        {"&nbsp;"  , " "},
50
        /* {"&nbsp;"  , "\302\240"}, */
51
        {"&iexcl;" , "\302\241"},
52
        {"&cent;"  , "\302\242"},
53
        {"&pound;" , "\302\243"},
54
        {"&curren;", "\302\244"},
55
        {"&yen;"   , "\302\245"},
56
        {"&brvbar;", "\302\246"},
57
        {"&sect;"  , "\302\247"},
58
        {"&uml;"   , "\302\250"},
59
        {"&copy;"  , "\302\251"},
60
        {"&ordf;"  , "\302\252"},
61
        {"&laquo;" , "\302\253"},
62
        {"&not;"   , "\302\254"},
63
        {"&shy;"   , "\302\255"},
64
        {"&reg;"   , "\302\256"},
65
        {"&macr;"  , "\302\257"},
66
        {"&deg;"   , "\302\260"},
67
        {"&plusm;" , "\302\261"},
68
        {"&sup2;"  , "\302\262"},
69
        {"&sup3;"  , "\302\263"},
70
        {"&acute;" , "\302\264"},
71
        {"&micro;" , "\302\265"},
72
        {"&para;"  , "\302\266"},
73
        {"&middot;", "\302\267"},
74
        {"&cedil;" , "\302\270"},
75
        {"&sup1;"  , "\302\271"},
76
        {"&ordm;"  , "\302\272"},
77
        {"&raquo;" , "\302\273"},
78
        {"&frac14;", "\302\274"},
79
        {"&frac12;", "\302\275"},
80
        {"&frac34;", "\302\276"},
81
        {"&iquest;", "\302\277"},
82

    
83
        {"&Agrave;", "\303\200"},
84
        {"&Aacute;", "\303\201"},
85
        {"&Acirc;" , "\303\202"},
86
        {"&Atilde;", "\303\203"},
87
        {"&Auml;"  , "\303\204"},
88
        {"&Aring;" , "\303\205"},
89
        {"&AElig;" , "\303\206"},
90
        {"&Ccedil;", "\303\207"},
91
        {"&Egrave;", "\303\210"},
92
        {"&Eacute;", "\303\211"},
93
        {"&Ecirc;" , "\303\212"},
94
        {"&Euml;"  , "\303\213"},
95
        {"&Igrave;", "\303\214"},
96
        {"&Iacute;", "\303\215"},
97
        {"&Icirc;" , "\303\216"},
98
        {"&Iuml;"  , "\303\217"},
99
        {"&ETH;"   , "\303\220"},
100
        {"&Ntilde;", "\303\221"},
101
        {"&Ograve;", "\303\222"},
102
        {"&Oacute;", "\303\223"},
103
        {"&Ocirc;" , "\303\224"},
104
        {"&Otilde;", "\303\225"},
105
        {"&Ouml;"  , "\303\226"},
106
        {"&times;" , "\303\227"},
107
        {"&Oslash;", "\303\230"},
108
        {"&Ugrave;", "\303\231"},
109
        {"&Uacute;", "\303\232"},
110
        {"&Ucirc;" , "\303\233"},
111
        {"&Uuml;"  , "\303\234"},
112
        {"&Yacute;", "\303\235"},
113
        {"&THORN;" , "\303\236"},
114
        {"&szlig;" , "\303\237"},
115
        {"&agrave;", "\303\240"},
116
        {"&aacute;", "\303\241"},
117
        {"&acirc;" , "\303\242"},
118
        {"&atilde;", "\303\243"},
119
        {"&auml;"  , "\303\244"},
120
        {"&aring;" , "\303\245"},
121
        {"&aelig;" , "\303\246"},
122
        {"&ccedil;", "\303\247"},
123
        {"&egrave;", "\303\250"},
124
        {"&eacute;", "\303\251"},
125
        {"&ecirc;" , "\303\252"},
126
        {"&euml;"  , "\303\253"},
127
        {"&igrave;", "\303\254"},
128
        {"&iacute;", "\303\255"},
129
        {"&icirc;" , "\303\256"},
130
        {"&iuml;"  , "\303\257"},
131
        {"&eth;"   , "\303\260"},
132
        {"&ntilde;", "\303\261"},
133
        {"&ograve;", "\303\262"},
134
        {"&oacute;", "\303\263"},
135
        {"&ocirc;" , "\303\264"},
136
        {"&otilde;", "\303\265"},
137
        {"&ouml;"  , "\303\266"},
138
        {"&divide;", "\303\267"},
139
        {"&oslash;", "\303\270"},
140
        {"&ugrave;", "\303\271"},
141
        {"&uacute;", "\303\272"},
142
        {"&ucirc;" , "\303\273"},
143
        {"&uuml;"  , "\303\274"},
144
        {"&yacute;", "\303\275"},
145
        {"&thorn;" , "\303\276"},
146
        {"&yuml;"  , "\303\277"}
147
};
148

    
149
static HTMLSymbol other_symbol_list[] = {
150
        /* Non-standard? */
151
        {"&#133;"  , "..."},
152
        {"&#146;"  , "'"},
153
        {"&#150;"  , "-"},
154
        {"&#153;"  , "\xe2\x84\xa2"},
155
        {"&#156;"  , "\xc5\x93"},
156

    
157
        /* Symbolic characters */
158
        {"&trade;" , "\xe2\x84\xa2"},
159

    
160
        /* Latin extended */
161
        {"&OElig;" , "\xc5\x92"},
162
        {"&oelig;" , "\xc5\x93"},
163
        {"&Scaron;", "\xc5\xa0"},
164
        {"&scaron;", "\xc5\xa1"},
165
        {"&Yuml;"  , "\xc5\xb8"},
166
        {"&circ;"  , "\xcb\x86"},
167
        {"&tilde;" , "\xcb\x9c"},
168
        {"&fnof;"  , "\xc6\x92"},
169
};
170

    
171
static GHashTable *default_symbol_table;
172

    
173
static HTMLState html_read_line                (HTMLParser        *parser);
174

    
175
static void html_append_char                (HTMLParser        *parser,
176
                                         gchar                 ch);
177
static void html_append_str                (HTMLParser        *parser,
178
                                         const gchar        *str,
179
                                         gint                 len);
180

    
181
static gchar *html_find_char                (HTMLParser        *parser,
182
                                         gchar                 ch);
183
static gchar *html_find_str                (HTMLParser        *parser,
184
                                         const gchar        *str);
185
static gchar *html_find_str_case        (HTMLParser        *parser,
186
                                         const gchar        *str);
187

    
188
static HTMLState html_parse_tag                (HTMLParser        *parser);
189
static void html_parse_special                (HTMLParser        *parser);
190
static void html_get_parenthesis        (HTMLParser        *parser,
191
                                         gchar                *buf,
192
                                         gint                 len);
193

    
194

    
195
HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv)
196
{
197
        HTMLParser *parser;
198

    
199
        g_return_val_if_fail(fp != NULL, NULL);
200
        g_return_val_if_fail(conv != NULL, NULL);
201

    
202
        parser = g_new0(HTMLParser, 1);
203
        parser->fp = fp;
204
        parser->conv = conv;
205
        parser->str = g_string_new(NULL);
206
        parser->buf = g_string_new(NULL);
207
        parser->bufp = parser->buf->str;
208
        parser->state = HTML_NORMAL;
209
        parser->href = NULL;
210
        parser->newline = TRUE;
211
        parser->empty_line = TRUE;
212
        parser->space = FALSE;
213
        parser->pre = FALSE;
214
        parser->blockquote = 0;
215

    
216
#define SYMBOL_TABLE_ADD(table, list) \
217
{ \
218
        gint i; \
219
 \
220
        for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \
221
                g_hash_table_insert(table, list[i].key, list[i].val); \
222
}
223

    
224
        if (!default_symbol_table) {
225
                default_symbol_table =
226
                        g_hash_table_new(g_str_hash, g_str_equal);
227
                SYMBOL_TABLE_ADD(default_symbol_table, symbol_list);
228
                SYMBOL_TABLE_ADD(default_symbol_table, latin_symbol_list);
229
                SYMBOL_TABLE_ADD(default_symbol_table, other_symbol_list);
230
        }
231

    
232
#undef SYMBOL_TABLE_ADD
233

    
234
        parser->symbol_table = default_symbol_table;
235

    
236
        return parser;
237
}
238

    
239
/*
 * Release PARSER together with its input buffer, output buffer and saved
 * href string.  The FILE and CodeConverter given to html_parser_new() and
 * the shared symbol table are not touched.
 *
 * Passing NULL is a no-op, matching html_free_tag() and g_free().
 */
void html_parser_destroy(HTMLParser *parser)
{
        if (!parser) return;

        g_string_free(parser->str, TRUE);
        g_string_free(parser->buf, TRUE);
        g_free(parser->href);
        g_free(parser);
}
246

    
247
/*
 * Produce the next chunk of plain text from the HTML input.
 *
 * The output buffer is emptied on entry, so the pointer returned by a
 * previous call is no longer meaningful.  If the input buffer is
 * exhausted a new line is read first.  Scanning stops at the end of the
 * buffered input, or in front of a '<' once some text has been collected
 * (the tag is then handled by the next call).
 *
 * Returns parser->str->str (owned by the parser), or NULL when a fresh
 * line was needed and the input is at end of file.
 */
const gchar *html_parse(HTMLParser *parser)
{
        parser->state = HTML_NORMAL;
        g_string_truncate(parser->str, 0);

        if (*parser->bufp == '\0') {
                g_string_truncate(parser->buf, 0);
                parser->bufp = parser->buf->str;
                if (html_read_line(parser) == HTML_EOF)
                        return NULL;
        }

        while (*parser->bufp != '\0') {
                const gchar cur = *parser->bufp;

                if (cur == '<') {
                        /* Hand back text gathered so far before touching
                         * the tag. */
                        if (parser->str->len != 0)
                                return parser->str->str;
                        html_parse_tag(parser);
                        continue;
                }

                if (cur == '&') {
                        html_parse_special(parser);
                        continue;
                }

                if (cur == ' ' || cur == '\t' || cur == '\r' || cur == '\n') {
                        /* Treat CR LF as a single LF. */
                        if (cur == '\r' && parser->bufp[1] == '\n')
                                parser->bufp++;

                        if (!parser->pre) {
                                /* Outside <pre>, whitespace collapses into
                                 * one pending space, and is dropped entirely
                                 * at the start of a line. */
                                if (!parser->newline)
                                        parser->space = TRUE;
                                parser->bufp++;
                                continue;
                        }
                        /* Inside <pre> whitespace is copied verbatim below. */
                }

                html_append_char(parser, *parser->bufp++);
        }

        return parser->str->str;
}
292

    
293
/*
 * Read one line (at most HTMLBUFSIZE - 1 bytes) from parser->fp, convert
 * it and append it to the input buffer.
 *
 * The read position parser->bufp is saved as an offset and restored after
 * the append, because growing the GString may move its storage.
 *
 * Returns HTML_EOF (also stored in parser->state) when nothing could be
 * read, HTML_CONV_FAILED when conv_convert() returned NULL and the result
 * of conv_utf8todisp() was appended instead, and HTML_NORMAL otherwise.
 */
static HTMLState html_read_line(HTMLParser *parser)
{
        gchar line[HTMLBUFSIZE];
        gchar *converted;
        HTMLState result = HTML_NORMAL;
        gint offset;

        if (!fgets(line, sizeof(line), parser->fp)) {
                parser->state = HTML_EOF;
                return HTML_EOF;
        }

        converted = conv_convert(parser->conv, line);
        if (converted == NULL) {
                /* Fallback conversion; the caller is told via the
                 * return value. */
                converted = conv_utf8todisp(line, NULL);
                result = HTML_CONV_FAILED;
        }

        offset = parser->bufp - parser->buf->str;
        g_string_append(parser->buf, converted);
        g_free(converted);
        parser->bufp = parser->buf->str + offset;

        return result;
}
326

    
327
/*
 * Append the single character CH to the output buffer.
 *
 * A pending collapsed space is written first (outside <pre>).  If the
 * output is at the start of a line inside a blockquote, two spaces per
 * nesting level are inserted in front of CH.  The newline / empty_line
 * flags are updated from CH and from the character that preceded it.
 */
static void html_append_char(HTMLParser *parser, gchar ch)
{
        GString *out = parser->str;
        gint indent = 0;
        gint level;

        if (parser->space && !parser->pre) {
                g_string_append_c(out, ' ');
                parser->space = FALSE;
        }

        /* Decide on the indent before the newline flag is overwritten. */
        if (parser->newline && parser->blockquote > 0)
                indent = parser->blockquote;

        if (ch != '\n') {
                parser->newline = FALSE;
                parser->empty_line = FALSE;
        } else {
                parser->newline = TRUE;
                /* A newline directly after a newline makes an empty line;
                 * checked before any indent is written. */
                if (out->len > 0 && out->str[out->len - 1] == '\n')
                        parser->empty_line = TRUE;
                else
                        parser->empty_line = FALSE;
        }

        for (level = 0; level < indent; level++)
                g_string_append(out, "  ");

        g_string_append_c(out, ch);
}
355

    
356
/*
 * Append STR to the output buffer.
 *
 * LEN is the number of bytes to take from STR; a negative LEN means STR
 * is NUL-terminated and is appended whole; LEN == 0 appends nothing.  A
 * pending collapsed space is flushed first in every case (outside <pre>).
 * At the start of a line inside a blockquote, two spaces per nesting
 * level are written before STR.  Afterwards the newline / empty_line
 * flags reflect the last two characters of the output buffer.
 */
static void html_append_str(HTMLParser *parser, const gchar *str, gint len)
{
        GString *out = parser->str;
        gint level;

        if (parser->space && !parser->pre) {
                g_string_append_c(out, ' ');
                parser->space = FALSE;
        }

        if (len == 0)
                return;

        if (parser->newline && parser->blockquote > 0) {
                for (level = 0; level < parser->blockquote; level++)
                        g_string_append(out, "  ");
        }

        if (len >= 0)
                g_string_append_len(out, str, len);
        else
                g_string_append(out, str);

        if (out->len == 0 || out->str[out->len - 1] != '\n') {
                parser->newline = FALSE;
                parser->empty_line = FALSE;
                return;
        }

        parser->newline = TRUE;
        if (out->len > 1 && out->str[out->len - 2] == '\n')
                parser->empty_line = TRUE;
        else
                parser->empty_line = FALSE;
}
390

    
391
/*
 * Locate the first occurrence of CH at or after the current read position,
 * appending further input lines to the buffer until it is found.
 *
 * Returns a pointer into the input buffer, or NULL if end of file is
 * reached first.
 */
static gchar *html_find_char(HTMLParser *parser, gchar ch)
{
        for (;;) {
                gchar *hit = strchr(parser->bufp, ch);

                if (hit != NULL)
                        return hit;
                if (html_read_line(parser) == HTML_EOF)
                        return NULL;
        }
}
402

    
403
/*
 * Locate the first occurrence of STR (case-sensitive) at or after the
 * current read position, appending further input lines to the buffer
 * until it is found.
 *
 * Returns a pointer into the input buffer, or NULL if end of file is
 * reached first.
 */
static gchar *html_find_str(HTMLParser *parser, const gchar *str)
{
        for (;;) {
                gchar *hit = strstr(parser->bufp, str);

                if (hit != NULL)
                        return hit;
                if (html_read_line(parser) == HTML_EOF)
                        return NULL;
        }
}
414

    
415
/*
 * Same as html_find_str() but the comparison is done with strcasestr(),
 * i.e. ignoring case.
 *
 * Returns a pointer into the input buffer, or NULL if end of file is
 * reached first.
 */
static gchar *html_find_str_case(HTMLParser *parser, const gchar *str)
{
        for (;;) {
                gchar *hit = strcasestr(parser->bufp, str);

                if (hit != NULL)
                        return hit;
                if (html_read_line(parser) == HTML_EOF)
                        return NULL;
        }
}
426

    
427
static HTMLTag *html_get_tag(const gchar *str)
428
{
429
        HTMLTag *tag;
430
        gchar *tmp;
431
        gchar *tmpp;
432

    
433
        g_return_val_if_fail(str != NULL, NULL);
434

    
435
        if (*str == '\0' || *str == '!') return NULL;
436

    
437
        tmp = g_strdup(str);
438

    
439
        tag = g_new0(HTMLTag, 1);
440

    
441
        for (tmpp = tmp; *tmpp != '\0' && !g_ascii_isspace(*tmpp); tmpp++) {
442
                if (tmpp > tmp && *tmpp == '/') {
443
                        *tmpp = '\0';
444
                        break;
445
                }
446
        }
447

    
448
        if (*tmpp == '\0') {
449
                g_strdown(tmp);
450
                tag->name = tmp;
451
                return tag;
452
        } else {
453
                *tmpp++ = '\0';
454
                g_strdown(tmp);
455
                tag->name = g_strdup(tmp);
456
        }
457

    
458
        while (*tmpp != '\0') {
459
                HTMLAttr *attr;
460
                gchar *attr_name;
461
                gchar *attr_value;
462
                gchar *p;
463
                gchar quote;
464

    
465
                while (g_ascii_isspace(*tmpp)) tmpp++;
466
                if (tmpp > tmp && *tmpp == '/')
467
                        break;
468
                attr_name = tmpp;
469

    
470
                while (*tmpp != '\0' && !g_ascii_isspace(*tmpp) &&
471
                       *tmpp != '=')
472
                        tmpp++;
473
                if (*tmpp != '\0' && *tmpp != '=') {
474
                        *tmpp++ = '\0';
475
                        while (g_ascii_isspace(*tmpp)) tmpp++;
476
                }
477

    
478
                if (*tmpp == '=') {
479
                        *tmpp++ = '\0';
480
                        while (g_ascii_isspace(*tmpp)) tmpp++;
481

    
482
                        if (*tmpp == '"' || *tmpp == '\'') {
483
                                /* name="value" */
484
                                quote = *tmpp;
485
                                tmpp++;
486
                                attr_value = tmpp;
487
                                if ((p = strchr(attr_value, quote)) == NULL) {
488
                                        g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
489
                                        break;
490
                                }
491
                                tmpp = p;
492
                                *tmpp++ = '\0';
493
                                while (g_ascii_isspace(*tmpp)) tmpp++;
494
                        } else {
495
                                /* name=value */
496
                                attr_value = tmpp;
497
                                while (*tmpp != '\0' && !g_ascii_isspace(*tmpp)) tmpp++;
498
                                if (*tmpp != '\0')
499
                                        *tmpp++ = '\0';
500
                        }
501
                } else
502
                        attr_value = "";
503

    
504
                g_strchomp(attr_name);
505
                g_strdown(attr_name);
506
                attr = g_new(HTMLAttr, 1);
507
                attr->name = g_strdup(attr_name);
508
                attr->value = g_strdup(attr_value);
509
                tag->attr = g_list_append(tag->attr, attr);
510
        }
511

    
512
        g_free(tmp);
513

    
514
        return tag;
515
}
516

    
517
/*
 * Release TAG, its name and every attribute (name, value and list node)
 * created by html_get_tag().  A NULL TAG is ignored.
 */
static void html_free_tag(HTMLTag *tag)
{
        GList *head;

        if (tag == NULL)
                return;

        g_free(tag->name);

        /* Repeatedly dispose of the first attribute until the list is
         * empty. */
        for (head = tag->attr; head != NULL; head = tag->attr) {
                HTMLAttr *attr = (HTMLAttr *)head->data;

                g_free(attr->name);
                g_free(attr->value);
                g_free(attr);
                tag->attr = g_list_remove(tag->attr, attr);
        }

        g_free(tag);
}
531

    
532
/*
 * Consume the tag at the current read position (which must be at '<')
 * and apply its effect on the text output and on the parser state.
 *
 *   br                      line break
 *   a href=... , /a         remember / forget the link target
 *   p                       blank line before the paragraph
 *   pre, /pre               switch verbatim whitespace on / off
 *   blockquote, /blockquote raise / lower the indent level (never below 0)
 *   hr                      horizontal rule on a line of its own
 *   div ul li table tr hN   start on a fresh line
 *   /table, /hN             followed by a blank line
 *   /div /ul /li            end the current line
 *
 * Anything else, including comments and skipped <style>/<script>
 * elements, leaves the output untouched.
 *
 * Returns the resulting parser->state; HTML_UNKNOWN for tags that are not
 * handled.
 */
static HTMLState html_parse_tag(HTMLParser *parser)
{
        gchar buf[HTMLBUFSIZE];
        HTMLTag *tag;

        html_get_parenthesis(parser, buf, sizeof(buf));

        tag = html_get_tag(buf);

        parser->state = HTML_UNKNOWN;
        if (!tag) return HTML_UNKNOWN;

        if (!strcmp(tag->name, "br")) {
                parser->space = FALSE;
                html_append_char(parser, '\n');
                parser->state = HTML_BR;
        } else if (!strcmp(tag->name, "a")) {
                GList *cur;

                /* Only an anchor that carries href changes the state. */
                for (cur = tag->attr; cur != NULL; cur = cur->next) {
                        HTMLAttr *attr = (HTMLAttr *)cur->data;

                        if (attr && !strcmp(attr->name, "href")) {
                                g_free(parser->href);
                                parser->href = g_strdup(attr->value);
                                parser->state = HTML_HREF;
                                break;
                        }
                }
        } else if (!strcmp(tag->name, "/a")) {
                g_free(parser->href);
                parser->href = NULL;
                parser->state = HTML_NORMAL;
        } else if (!strcmp(tag->name, "p")) {
                parser->space = FALSE;
                if (!parser->empty_line) {
                        /* Finish the current line if needed, then add
                         * one empty line. */
                        if (!parser->newline) html_append_char(parser, '\n');
                        html_append_char(parser, '\n');
                }
                parser->state = HTML_PAR;
        } else if (!strcmp(tag->name, "pre")) {
                parser->pre = TRUE;
                parser->state = HTML_PRE;
        } else if (!strcmp(tag->name, "/pre")) {
                parser->pre = FALSE;
                parser->state = HTML_NORMAL;
        } else if (!strcmp(tag->name, "blockquote")) {
                parser->blockquote++;
                parser->state = HTML_BLOCKQUOTE;
        } else if (!strcmp(tag->name, "/blockquote")) {
                parser->blockquote--;
                /* Tolerate unbalanced closing tags. */
                if (parser->blockquote < 0)
                        parser->blockquote = 0;
                parser->state = HTML_NORMAL;
        } else if (!strcmp(tag->name, "hr")) {
                if (!parser->newline) {
                        parser->space = FALSE;
                        html_append_char(parser, '\n');
                }
                html_append_str(parser, HR_STR "\n", -1);
                parser->state = HTML_HR;
        } else if (!strcmp(tag->name, "div")    ||
                   !strcmp(tag->name, "ul")     ||
                   !strcmp(tag->name, "li")     ||
                   !strcmp(tag->name, "table")  ||
                   !strcmp(tag->name, "tr")     ||
                   (tag->name[0] == 'h' && g_ascii_isdigit(tag->name[1]))) {
                if (!parser->newline) {
                        parser->space = FALSE;
                        html_append_char(parser, '\n');
                }
                parser->state = HTML_NORMAL;
        } else if (!strcmp(tag->name, "/table") ||
                   (tag->name[0] == '/' &&
                    tag->name[1] == 'h' &&
                    /* The digit of "/hN" is the third character; testing
                     * name[1] (always 'h' here) could never succeed. */
                    g_ascii_isdigit(tag->name[2]))) {
                if (!parser->empty_line) {
                        parser->space = FALSE;
                        if (!parser->newline) html_append_char(parser, '\n');
                        html_append_char(parser, '\n');
                }
                parser->state = HTML_NORMAL;
        } else if (!strcmp(tag->name, "/div")   ||
                   !strcmp(tag->name, "/ul")    ||
                   !strcmp(tag->name, "/li")) {
                if (!parser->newline) {
                        parser->space = FALSE;
                        html_append_char(parser, '\n');
                }
                parser->state = HTML_NORMAL;
        }

        html_free_tag(tag);

        return parser->state;
}
629

    
630
/*
 * Translate the character reference at the current read position (which
 * must be at '&') and append the result to the output.
 *
 * A reference is recognised only if its terminating ';' lies within the
 * first 8 bytes; otherwise a literal '&' is emitted and scanning resumes
 * right after it.  Named and table-listed references are looked up in
 * parser->symbol_table.  Decimal references ("&#NNN;") not found there
 * are emitted as the ASCII character if printable, else as the UTF-8
 * encoding of the code point.  Code point 0 and the surrogate range
 * U+D800-U+DFFF are not encoded (they would put a NUL byte or invalid
 * UTF-8 into the output).  Such references, like any unrecognised ones,
 * are copied to the output unchanged and leave parser->state at
 * HTML_UNKNOWN; all other paths set HTML_NORMAL.
 */
static void html_parse_special(HTMLParser *parser)
{
        gchar symbol_name[9];
        gint n;
        const gchar *val;

        parser->state = HTML_UNKNOWN;
        g_return_if_fail(*parser->bufp == '&');

        /* &foo; -- look for ';' but give up after 8 bytes instead of
         * walking the rest of the buffer for every stray '&'. */
        for (n = 0; n <= 7 && parser->bufp[n] != '\0' &&
                    parser->bufp[n] != ';'; n++)
                ;
        if (n > 7 || parser->bufp[n] != ';') {
                /* output literal `&' */
                html_append_char(parser, *parser->bufp++);
                parser->state = HTML_NORMAL;
                return;
        }
        /* n + 1 bytes of text ("&...;") plus the terminator fit, since
         * n <= 7; strncpy2 is presumably a NUL-terminating bounded copy
         * taking the destination size (defined outside this file). */
        strncpy2(symbol_name, parser->bufp, n + 2);
        parser->bufp += n + 1;

        if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
            != NULL) {
                html_append_str(parser, val, -1);
                parser->state = HTML_NORMAL;
                return;
        } else if (symbol_name[1] == '#' && g_ascii_isdigit(symbol_name[2])) {
                gint ch;

                /* At most 5 digits fit in symbol_name, so this cannot
                 * overflow or be negative. */
                ch = atoi(symbol_name + 2);
                if (ch < 128 && g_ascii_isprint(ch)) {
                        html_append_char(parser, ch);
                        parser->state = HTML_NORMAL;
                        return;
                } else if (ch > 0 && (ch < 0xD800 || ch > 0xDFFF)) {
                        /* ISO 10646 to UTF-8 */
                        gchar buf[6];
                        gint len;

                        len = g_unichar_to_utf8((gunichar)ch, buf);
                        if (len > 0) {
                                html_append_str(parser, buf, len);
                                parser->state = HTML_NORMAL;
                                return;
                        }
                }
        }

        /* Unknown or unusable reference: pass it through as written. */
        html_append_str(parser, symbol_name, -1);
}
680

    
681
static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len)
682
{
683
        gchar *p;
684

    
685
        buf[0] = '\0';
686
        g_return_if_fail(*parser->bufp == '<');
687

    
688
        /* ignore comment / CSS / script stuff */
689
        if (!strncmp(parser->bufp, "<!--", 4)) {
690
                parser->bufp += 4;
691
                if ((p = html_find_str(parser, "-->")) != NULL)
692
                        parser->bufp = p + 3;
693
                return;
694
        }
695
        if (!g_ascii_strncasecmp(parser->bufp, "<style", 6)) {
696
                parser->bufp += 6;
697
                if ((p = html_find_str_case(parser, "</style")) != NULL) {
698
                        parser->bufp = p + 7;
699
                        if ((p = html_find_char(parser, '>')) != NULL)
700
                                parser->bufp = p + 1;
701
                }
702
                return;
703
        }
704
        if (!g_ascii_strncasecmp(parser->bufp, "<script", 7)) {
705
                parser->bufp += 7;
706
                if ((p = html_find_str_case(parser, "</script")) != NULL) {
707
                        parser->bufp = p + 8;
708
                        if ((p = html_find_char(parser, '>')) != NULL)
709
                                parser->bufp = p + 1;
710
                }
711
                return;
712
        }
713

    
714
        parser->bufp++;
715
        if ((p = html_find_char(parser, '>')) == NULL)
716
                return;
717

    
718
        strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
719
        g_strstrip(buf);
720
        parser->bufp = p + 1;
721
}