Statistics
| Revision:

root / libsylph / html.c @ 3273

History | View | Annotate | Download (17.9 KB)

/*
 * LibSylph -- E-Mail client library
 * Copyright (C) 1999-2011 Hiroyuki Yamamoto
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
19

    
20
#include <glib.h>
21
#include <stdio.h>
22
#include <string.h>
23
#include <ctype.h>
24

    
25
#include "html.h"
26
#include "codeconv.h"
27
#include "utils.h"
28

    
29
/* Size in bytes of the stack buffers used for one input line and one tag. */
#define HTMLBUFSIZE	8192

/* Text emitted (followed by a newline) in place of an <hr> element. */
#define HR_STR		"------------------------------------------------"
31

    
32
typedef struct _HTMLSymbol        HTMLSymbol;
33

    
34
struct _HTMLSymbol
35
{
36
        gchar *const key;
37
        gchar *const val;
38
};
39

    
40
static HTMLSymbol symbol_list[] = {
41
        {"&lt;"    , "<"},
42
        {"&gt;"    , ">"},
43
        {"&amp;"   , "&"},
44
        {"&quot;"  , "\""}
45
};
46

    
47
/* &#160; - &#255; */
48
static HTMLSymbol latin_symbol_list[] = {
49
        {"&nbsp;"  , " "},
50
        /* {"&nbsp;"  , "\302\240"}, */
51
        {"&iexcl;" , "\302\241"},
52
        {"&cent;"  , "\302\242"},
53
        {"&pound;" , "\302\243"},
54
        {"&curren;", "\302\244"},
55
        {"&yen;"   , "\302\245"},
56
        {"&brvbar;", "\302\246"},
57
        {"&sect;"  , "\302\247"},
58
        {"&uml;"   , "\302\250"},
59
        {"&copy;"  , "\302\251"},
60
        {"&ordf;"  , "\302\252"},
61
        {"&laquo;" , "\302\253"},
62
        {"&not;"   , "\302\254"},
63
        {"&shy;"   , "\302\255"},
64
        {"&reg;"   , "\302\256"},
65
        {"&macr;"  , "\302\257"},
66
        {"&deg;"   , "\302\260"},
67
        {"&plusm;" , "\302\261"},
68
        {"&sup2;"  , "\302\262"},
69
        {"&sup3;"  , "\302\263"},
70
        {"&acute;" , "\302\264"},
71
        {"&micro;" , "\302\265"},
72
        {"&para;"  , "\302\266"},
73
        {"&middot;", "\302\267"},
74
        {"&cedil;" , "\302\270"},
75
        {"&sup1;"  , "\302\271"},
76
        {"&ordm;"  , "\302\272"},
77
        {"&raquo;" , "\302\273"},
78
        {"&frac14;", "\302\274"},
79
        {"&frac12;", "\302\275"},
80
        {"&frac34;", "\302\276"},
81
        {"&iquest;", "\302\277"},
82

    
83
        {"&Agrave;", "\303\200"},
84
        {"&Aacute;", "\303\201"},
85
        {"&Acirc;" , "\303\202"},
86
        {"&Atilde;", "\303\203"},
87
        {"&Auml;"  , "\303\204"},
88
        {"&Aring;" , "\303\205"},
89
        {"&AElig;" , "\303\206"},
90
        {"&Ccedil;", "\303\207"},
91
        {"&Egrave;", "\303\210"},
92
        {"&Eacute;", "\303\211"},
93
        {"&Ecirc;" , "\303\212"},
94
        {"&Euml;"  , "\303\213"},
95
        {"&Igrave;", "\303\214"},
96
        {"&Iacute;", "\303\215"},
97
        {"&Icirc;" , "\303\216"},
98
        {"&Iuml;"  , "\303\217"},
99
        {"&ETH;"   , "\303\220"},
100
        {"&Ntilde;", "\303\221"},
101
        {"&Ograve;", "\303\222"},
102
        {"&Oacute;", "\303\223"},
103
        {"&Ocirc;" , "\303\224"},
104
        {"&Otilde;", "\303\225"},
105
        {"&Ouml;"  , "\303\226"},
106
        {"&times;" , "\303\227"},
107
        {"&Oslash;", "\303\230"},
108
        {"&Ugrave;", "\303\231"},
109
        {"&Uacute;", "\303\232"},
110
        {"&Ucirc;" , "\303\233"},
111
        {"&Uuml;"  , "\303\234"},
112
        {"&Yacute;", "\303\235"},
113
        {"&THORN;" , "\303\236"},
114
        {"&szlig;" , "\303\237"},
115
        {"&agrave;", "\303\240"},
116
        {"&aacute;", "\303\241"},
117
        {"&acirc;" , "\303\242"},
118
        {"&atilde;", "\303\243"},
119
        {"&auml;"  , "\303\244"},
120
        {"&aring;" , "\303\245"},
121
        {"&aelig;" , "\303\246"},
122
        {"&ccedil;", "\303\247"},
123
        {"&egrave;", "\303\250"},
124
        {"&eacute;", "\303\251"},
125
        {"&ecirc;" , "\303\252"},
126
        {"&euml;"  , "\303\253"},
127
        {"&igrave;", "\303\254"},
128
        {"&iacute;", "\303\255"},
129
        {"&icirc;" , "\303\256"},
130
        {"&iuml;"  , "\303\257"},
131
        {"&eth;"   , "\303\260"},
132
        {"&ntilde;", "\303\261"},
133
        {"&ograve;", "\303\262"},
134
        {"&oacute;", "\303\263"},
135
        {"&ocirc;" , "\303\264"},
136
        {"&otilde;", "\303\265"},
137
        {"&ouml;"  , "\303\266"},
138
        {"&divide;", "\303\267"},
139
        {"&oslash;", "\303\270"},
140
        {"&ugrave;", "\303\271"},
141
        {"&uacute;", "\303\272"},
142
        {"&ucirc;" , "\303\273"},
143
        {"&uuml;"  , "\303\274"},
144
        {"&yacute;", "\303\275"},
145
        {"&thorn;" , "\303\276"},
146
        {"&yuml;"  , "\303\277"}
147
};
148

    
149
static HTMLSymbol other_symbol_list[] = {
150
        /* Non-standard? */
151
        {"&#133;"  , "..."},
152
        {"&#146;"  , "'"},
153
        {"&#150;"  , "-"},
154
        {"&#153;"  , "\xe2\x84\xa2"},
155
        {"&#156;"  , "\xc5\x93"},
156

    
157
        /* Symbolic characters */
158
        {"&trade;" , "\xe2\x84\xa2"},
159

    
160
        /* Latin extended */
161
        {"&OElig;" , "\xc5\x92"},
162
        {"&oelig;" , "\xc5\x93"},
163
        {"&Scaron;", "\xc5\xa0"},
164
        {"&scaron;", "\xc5\xa1"},
165
        {"&Yuml;"  , "\xc5\xb8"},
166
        {"&circ;"  , "\xcb\x86"},
167
        {"&tilde;" , "\xcb\x9c"},
168
        {"&fnof;"  , "\xc6\x92"},
169
};
170

    
171
static GHashTable *default_symbol_table;
172

    
173
static HTMLState html_read_line                (HTMLParser        *parser);
174

    
175
static void html_append_char                (HTMLParser        *parser,
176
                                         gchar                 ch);
177
static void html_append_str                (HTMLParser        *parser,
178
                                         const gchar        *str,
179
                                         gint                 len);
180

    
181
static gchar *html_find_char                (HTMLParser        *parser,
182
                                         gchar                 ch);
183
static gchar *html_find_str                (HTMLParser        *parser,
184
                                         const gchar        *str);
185
static gchar *html_find_str_case        (HTMLParser        *parser,
186
                                         const gchar        *str);
187

    
188
static HTMLState html_parse_tag                (HTMLParser        *parser);
189
static void html_parse_special                (HTMLParser        *parser);
190
static void html_get_parenthesis        (HTMLParser        *parser,
191
                                         gchar                *buf,
192
                                         gint                 len);
193

    
194
static gchar *html_unescape_str                (HTMLParser        *parser,
195
                                         const gchar        *str);
196

    
197

    
198
/*
 * Create a parser that reads HTML from fp and converts each line with conv.
 *
 * The shared entity table is created and filled the first time a parser
 * is constructed; every parser points at that same table.
 *
 * Returns a newly allocated parser (release it with html_parser_destroy()),
 * or NULL when fp or conv is NULL.
 */
HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv)
{
	HTMLParser *parser;

	g_return_val_if_fail(fp != NULL, NULL);
	g_return_val_if_fail(conv != NULL, NULL);

	if (default_symbol_table == NULL) {
		GHashTable *table;
		guint idx;

		table = g_hash_table_new(g_str_hash, g_str_equal);

		for (idx = 0; idx < G_N_ELEMENTS(symbol_list); idx++)
			g_hash_table_insert(table, symbol_list[idx].key,
					    symbol_list[idx].val);
		for (idx = 0; idx < G_N_ELEMENTS(latin_symbol_list); idx++)
			g_hash_table_insert(table, latin_symbol_list[idx].key,
					    latin_symbol_list[idx].val);
		for (idx = 0; idx < G_N_ELEMENTS(other_symbol_list); idx++)
			g_hash_table_insert(table, other_symbol_list[idx].key,
					    other_symbol_list[idx].val);

		default_symbol_table = table;
	}

	parser = g_new0(HTMLParser, 1);

	/* input side */
	parser->fp = fp;
	parser->conv = conv;
	parser->buf = g_string_new(NULL);
	parser->bufp = parser->buf->str;

	/* output side */
	parser->str = g_string_new(NULL);
	parser->state = HTML_NORMAL;
	parser->href = NULL;

	/* formatting state: output starts at the beginning of an empty line */
	parser->newline = TRUE;
	parser->empty_line = TRUE;
	parser->space = FALSE;
	parser->pre = FALSE;
	parser->blockquote = 0;

	parser->symbol_table = default_symbol_table;

	return parser;
}
241

    
242
/*
 * Release a parser created by html_parser_new().
 *
 * Frees the output string, the input buffer, the stored href and the
 * parser itself.  The FILE, the code converter and the symbol table are
 * not touched here.  Passing NULL is a no-op, matching html_free_tag().
 */
void html_parser_destroy(HTMLParser *parser)
{
	if (!parser)
		return;

	g_string_free(parser->str, TRUE);
	g_string_free(parser->buf, TRUE);
	g_free(parser->href);
	g_free(parser);
}
249

    
250
/*
 * Produce the next chunk of plain text from the HTML input.
 *
 * Resets the state to HTML_NORMAL and clears the output string; if the
 * input buffer is exhausted, one more line is read first.  Text is then
 * accumulated until either the buffer runs out or a '<' is met after some
 * text has already been collected.  A tag at the very start of the chunk
 * is processed via html_parse_tag(), which sets parser->state.
 *
 * Returns the parser-owned output string (valid until the next call), or
 * NULL when the end of input was reached while refilling the buffer.
 */
const gchar *html_parse(HTMLParser *parser)
{
	parser->state = HTML_NORMAL;
	g_string_truncate(parser->str, 0);

	if (*parser->bufp == '\0') {
		g_string_truncate(parser->buf, 0);
		parser->bufp = parser->buf->str;
		if (html_read_line(parser) == HTML_EOF)
			return NULL;
	}

	for (;;) {
		const gchar c = *parser->bufp;

		if (c == '\0')
			break;

		if (c == '<') {
			/* hand back pending text before handling the tag */
			if (parser->str->len != 0)
				return parser->str->str;
			html_parse_tag(parser);
			continue;
		}

		if (c == '&') {
			html_parse_special(parser);
			continue;
		}

		if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
			/* treat CR LF as a single LF */
			if (c == '\r' && parser->bufp[1] == '\n')
				parser->bufp++;

			if (!parser->pre) {
				/* collapse whitespace; dropped at line start */
				if (!parser->newline)
					parser->space = TRUE;
				parser->bufp++;
				continue;
			}
			/* inside <pre>: whitespace is copied verbatim below */
		}

		html_append_char(parser, *parser->bufp++);
	}

	return parser->str->str;
}
295

    
296
/*
 * Read one line from the input file, convert it and append it to the
 * input buffer.  parser->bufp keeps pointing at the same offset even if
 * the buffer is reallocated by the append.
 *
 * Returns HTML_EOF (also stored in parser->state) when nothing could be
 * read, HTML_CONV_FAILED when conv_convert() failed and the output of
 * conv_utf8todisp() was appended instead, HTML_NORMAL otherwise.
 */
static HTMLState html_read_line(HTMLParser *parser)
{
	gchar line[HTMLBUFSIZE];
	gchar *converted;
	gint offset;
	HTMLState result = HTML_NORMAL;

	if (!fgets(line, sizeof(line), parser->fp)) {
		parser->state = HTML_EOF;
		return HTML_EOF;
	}

	converted = conv_convert(parser->conv, line);
	if (converted == NULL) {
		converted = conv_utf8todisp(line, NULL);
		result = HTML_CONV_FAILED;
	}

	/* remember the read position as an offset across the append */
	offset = parser->bufp - parser->buf->str;
	g_string_append(parser->buf, converted);
	g_free(converted);
	parser->bufp = parser->buf->str + offset;

	return result;
}
329

    
330
/*
 * Append a single character to the output string.
 *
 * Outside <pre>, a pending collapsed space is emitted first.  When the
 * output is at the start of a line inside one or more blockquotes, two
 * spaces per nesting level are written before the character.  The
 * newline / empty_line flags are updated for the appended character.
 */
static void html_append_char(HTMLParser *parser, gchar ch)
{
	GString *out = parser->str;
	gboolean indent;
	gint depth;

	if (parser->space && !parser->pre) {
		g_string_append_c(out, ' ');
		parser->space = FALSE;
	}

	/* decided before the newline flag is updated below */
	indent = (parser->newline && parser->blockquote > 0);

	if (ch != '\n') {
		parser->newline = FALSE;
		parser->empty_line = FALSE;
	} else {
		parser->newline = TRUE;
		/* a newline right after a newline makes an empty line */
		parser->empty_line =
			(out->len > 0 && out->str[out->len - 1] == '\n');
	}

	if (indent) {
		for (depth = parser->blockquote; depth > 0; depth--)
			g_string_append(out, "  ");
	}

	g_string_append_c(out, ch);
}
358

    
359
/*
 * Append a string to the output string.
 *
 * len < 0 appends the whole NUL-terminated str, len > 0 appends exactly
 * len bytes, and len == 0 appends nothing (a pending collapsed space is
 * still flushed).  Blockquote indentation is written first when the
 * output is at the start of a line, and the newline / empty_line flags
 * are recomputed from the tail of the output afterwards.
 */
static void html_append_str(HTMLParser *parser, const gchar *str, gint len)
{
	GString *out = parser->str;
	gboolean ends_with_nl;

	if (parser->space && !parser->pre) {
		g_string_append_c(out, ' ');
		parser->space = FALSE;
	}

	if (len == 0)
		return;

	if (parser->blockquote > 0 && parser->newline) {
		gint depth = parser->blockquote;

		while (depth-- > 0)
			g_string_append(out, "  ");
	}

	if (len > 0)
		g_string_append_len(out, str, len);
	else
		g_string_append(out, str);

	ends_with_nl = (out->len > 0 && out->str[out->len - 1] == '\n');
	parser->newline = ends_with_nl;
	parser->empty_line = (ends_with_nl && out->len > 1 &&
			      out->str[out->len - 2] == '\n');
}
393

    
394
/*
 * Find the first occurrence of ch at or after the current read position,
 * reading further input lines until it shows up.
 *
 * Returns a pointer into the input buffer, or NULL at end of input.
 */
static gchar *html_find_char(HTMLParser *parser, gchar ch)
{
	for (;;) {
		gchar *found = strchr(parser->bufp, ch);

		if (found != NULL)
			return found;
		if (html_read_line(parser) == HTML_EOF)
			return NULL;
	}
}
405

    
406
/*
 * Find the first case-sensitive occurrence of str at or after the current
 * read position, reading further input lines until it shows up.
 *
 * Returns a pointer into the input buffer, or NULL at end of input.
 */
static gchar *html_find_str(HTMLParser *parser, const gchar *str)
{
	for (;;) {
		gchar *found = strstr(parser->bufp, str);

		if (found != NULL)
			return found;
		if (html_read_line(parser) == HTML_EOF)
			return NULL;
	}
}
417

    
418
/*
 * Same as html_find_str() but the match ignores case (via strcasestr()).
 *
 * Returns a pointer into the input buffer, or NULL at end of input.
 */
static gchar *html_find_str_case(HTMLParser *parser, const gchar *str)
{
	for (;;) {
		gchar *found = strcasestr(parser->bufp, str);

		if (found != NULL)
			return found;
		if (html_read_line(parser) == HTML_EOF)
			return NULL;
	}
}
429

    
430
static HTMLTag *html_get_tag(const gchar *str)
431
{
432
        HTMLTag *tag;
433
        gchar *tmp;
434
        gchar *tmpp;
435

    
436
        g_return_val_if_fail(str != NULL, NULL);
437

    
438
        if (*str == '\0' || *str == '!') return NULL;
439

    
440
        tmp = g_strdup(str);
441

    
442
        tag = g_new0(HTMLTag, 1);
443

    
444
        for (tmpp = tmp; *tmpp != '\0' && !g_ascii_isspace(*tmpp); tmpp++) {
445
                if (tmpp > tmp && *tmpp == '/') {
446
                        *tmpp = '\0';
447
                        break;
448
                }
449
        }
450

    
451
        if (*tmpp == '\0') {
452
                g_strdown(tmp);
453
                tag->name = tmp;
454
                return tag;
455
        } else {
456
                *tmpp++ = '\0';
457
                g_strdown(tmp);
458
                tag->name = g_strdup(tmp);
459
        }
460

    
461
        while (*tmpp != '\0') {
462
                HTMLAttr *attr;
463
                gchar *attr_name;
464
                gchar *attr_value;
465
                gchar *p;
466
                gchar quote;
467

    
468
                while (g_ascii_isspace(*tmpp)) tmpp++;
469
                if (tmpp > tmp && *tmpp == '/')
470
                        break;
471
                attr_name = tmpp;
472

    
473
                while (*tmpp != '\0' && !g_ascii_isspace(*tmpp) &&
474
                       *tmpp != '=')
475
                        tmpp++;
476
                if (*tmpp != '\0' && *tmpp != '=') {
477
                        *tmpp++ = '\0';
478
                        while (g_ascii_isspace(*tmpp)) tmpp++;
479
                }
480

    
481
                if (*tmpp == '=') {
482
                        *tmpp++ = '\0';
483
                        while (g_ascii_isspace(*tmpp)) tmpp++;
484

    
485
                        if (*tmpp == '"' || *tmpp == '\'') {
486
                                /* name="value" */
487
                                quote = *tmpp;
488
                                tmpp++;
489
                                attr_value = tmpp;
490
                                if ((p = strchr(attr_value, quote)) == NULL) {
491
                                        g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
492
                                        break;
493
                                }
494
                                tmpp = p;
495
                                *tmpp++ = '\0';
496
                                while (g_ascii_isspace(*tmpp)) tmpp++;
497
                        } else {
498
                                /* name=value */
499
                                attr_value = tmpp;
500
                                while (*tmpp != '\0' && !g_ascii_isspace(*tmpp)) tmpp++;
501
                                if (*tmpp != '\0')
502
                                        *tmpp++ = '\0';
503
                        }
504
                } else
505
                        attr_value = "";
506

    
507
                g_strchomp(attr_name);
508
                g_strdown(attr_name);
509
                attr = g_new(HTMLAttr, 1);
510
                attr->name = g_strdup(attr_name);
511
                attr->value = g_strdup(attr_value);
512
                tag->attr = g_list_append(tag->attr, attr);
513
        }
514

    
515
        g_free(tmp);
516

    
517
        return tag;
518
}
519

    
520
/*
 * Free a tag returned by html_get_tag(), including its name and every
 * attribute (name, value and list node).  NULL is accepted and ignored.
 */
static void html_free_tag(HTMLTag *tag)
{
	GList *node;

	if (tag == NULL)
		return;

	for (node = tag->attr; node != NULL; node = node->next) {
		HTMLAttr *attr = (HTMLAttr *)node->data;

		g_free(attr->name);
		g_free(attr->value);
		g_free(attr);
	}
	g_list_free(tag->attr);

	g_free(tag->name);
	g_free(tag);
}
534

    
535
/* End the current output line unless the output is already at line start. */
static void html_tag_break_line(HTMLParser *parser)
{
	if (!parser->newline) {
		parser->space = FALSE;
		html_append_char(parser, '\n');
	}
}

/* Make the output end with an empty line unless it already does. */
static void html_tag_blank_line(HTMLParser *parser)
{
	if (!parser->empty_line) {
		parser->space = FALSE;
		if (!parser->newline)
			html_append_char(parser, '\n');
		html_append_char(parser, '\n');
	}
}

/*
 * Consume the tag at the current read position and apply its effect to
 * the output text and the parser state.
 *
 *   br                      newline                       HTML_BR
 *   a (with href)           stores the unescaped href     HTML_HREF
 *   /a                      clears the stored href        HTML_NORMAL
 *   p                       blank line                    HTML_PAR
 *   pre, /pre               toggles preformatted mode     HTML_PRE / HTML_NORMAL
 *   blockquote, /blockquote changes the quote depth       HTML_BLOCKQUOTE / HTML_NORMAL
 *   hr                      line break + HR_STR line      HTML_HR
 *   div ul li table tr hN   line break                    HTML_NORMAL
 *   /table /hN              blank line                    HTML_NORMAL
 *   /div /ul /li            line break                    HTML_NORMAL
 *
 * Anything else (including comments and skipped style/script sections,
 * for which no tag is produced) leaves the state at HTML_UNKNOWN.
 *
 * Returns the resulting parser->state.
 */
static HTMLState html_parse_tag(HTMLParser *parser)
{
	gchar buf[HTMLBUFSIZE];
	HTMLTag *tag;
	const gchar *name;

	html_get_parenthesis(parser, buf, sizeof(buf));

	tag = html_get_tag(buf);

	parser->state = HTML_UNKNOWN;
	if (!tag) return HTML_UNKNOWN;

	name = tag->name;

	if (!strcmp(name, "br")) {
		parser->space = FALSE;
		html_append_char(parser, '\n');
		parser->state = HTML_BR;
	} else if (!strcmp(name, "a")) {
		GList *cur;

		for (cur = tag->attr; cur != NULL; cur = cur->next) {
			HTMLAttr *attr = (HTMLAttr *)cur->data;

			if (attr && !strcmp(attr->name, "href")) {
				g_free(parser->href);
				parser->href = html_unescape_str(parser, attr->value);
				parser->state = HTML_HREF;
				break;
			}
		}
	} else if (!strcmp(name, "/a")) {
		g_free(parser->href);
		parser->href = NULL;
		parser->state = HTML_NORMAL;
	} else if (!strcmp(name, "p")) {
		/* a pending space is dropped even if no newline is added */
		parser->space = FALSE;
		html_tag_blank_line(parser);
		parser->state = HTML_PAR;
	} else if (!strcmp(name, "pre")) {
		parser->pre = TRUE;
		parser->state = HTML_PRE;
	} else if (!strcmp(name, "/pre")) {
		parser->pre = FALSE;
		parser->state = HTML_NORMAL;
	} else if (!strcmp(name, "blockquote")) {
		parser->blockquote++;
		parser->state = HTML_BLOCKQUOTE;
	} else if (!strcmp(name, "/blockquote")) {
		/* unbalanced closing tags must not drive the depth negative */
		parser->blockquote--;
		if (parser->blockquote < 0)
			parser->blockquote = 0;
		parser->state = HTML_NORMAL;
	} else if (!strcmp(name, "hr")) {
		html_tag_break_line(parser);
		html_append_str(parser, HR_STR "\n", -1);
		parser->state = HTML_HR;
	} else if (!strcmp(name, "div")    ||
		   !strcmp(name, "ul")     ||
		   !strcmp(name, "li")     ||
		   !strcmp(name, "table")  ||
		   !strcmp(name, "tr")     ||
		   (name[0] == 'h' && g_ascii_isdigit(name[1]))) {
		html_tag_break_line(parser);
		parser->state = HTML_NORMAL;
	} else if (!strcmp(name, "/table") ||
		   (name[0] == '/' &&
		    name[1] == 'h' &&
		    g_ascii_isdigit(name[2]))) {
		/*
		 * Closing heading "/hN": the digit is the third character.
		 * name[2] is safe to read: name[1] == 'h' guarantees that
		 * the string is at least two characters long.
		 */
		html_tag_blank_line(parser);
		parser->state = HTML_NORMAL;
	} else if (!strcmp(name, "/div")   ||
		   !strcmp(name, "/ul")    ||
		   !strcmp(name, "/li")) {
		html_tag_break_line(parser);
		parser->state = HTML_NORMAL;
	}

	html_free_tag(tag);

	return parser->state;
}
632

    
633
/*
 * Decode the entity reference at the current read position ("&name;" or
 * "&#NNN;") and append the result to the output.
 *
 * - No ';' in the buffered input, or more than 7 characters before it:
 *   the '&' is emitted literally and only that one character is consumed.
 * - Reference found in the symbol table: its replacement text is emitted.
 * - Decimal numeric reference: printable ASCII is emitted as is; any
 *   other valid, non-zero Unicode code point is emitted as UTF-8.
 * - Everything else, including "&#0;" and invalid code points such as
 *   surrogates, is emitted verbatim so that neither a NUL byte nor
 *   malformed UTF-8 can end up in the output; the state is then left at
 *   HTML_UNKNOWN.
 *
 * In all other cases the state is HTML_NORMAL on return.
 */
static void html_parse_special(HTMLParser *parser)
{
	gchar symbol_name[9];	/* '&' + up to 6 characters + ';' + NUL */
	gint n;
	const gchar *val;

	parser->state = HTML_UNKNOWN;
	g_return_if_fail(*parser->bufp == '&');

	/* &foo; */
	for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++)
		;
	if (n > 7 || parser->bufp[n] != ';') {
		/* output literal `&' */
		html_append_char(parser, *parser->bufp++);
		parser->state = HTML_NORMAL;
		return;
	}
	strncpy2(symbol_name, parser->bufp, n + 2);
	parser->bufp += n + 1;

	val = g_hash_table_lookup(parser->symbol_table, symbol_name);
	if (val != NULL) {
		html_append_str(parser, val, -1);
		parser->state = HTML_NORMAL;
		return;
	}

	if (symbol_name[1] == '#' && g_ascii_isdigit(symbol_name[2])) {
		gint ch;

		ch = atoi(symbol_name + 2);
		if (ch < 128 && g_ascii_isprint(ch)) {
			html_append_char(parser, ch);
			parser->state = HTML_NORMAL;
			return;
		} else if (ch > 0 && g_unichar_validate((gunichar)ch)) {
			/* ISO 10646 to UTF-8 */
			gchar buf[6];
			gint len;

			len = g_unichar_to_utf8((gunichar)ch, buf);
			if (len > 0) {
				html_append_str(parser, buf, len);
				parser->state = HTML_NORMAL;
				return;
			}
		}
	}

	/* unknown or unusable reference: keep the original text */
	html_append_str(parser, symbol_name, -1);
}
683

    
684
/*
 * Extract the content of the tag starting at the current read position.
 *
 * On entry *parser->bufp must be '<'.  Comments ("<!-- ... -->") and
 * whole <style> / <script> elements are skipped and leave buf empty.
 * For any other tag, the text between '<' and the next '>' is copied
 * into buf (truncated to fit len bytes), stripped of surrounding
 * whitespace, and the read position moves past the '>'.  If the closing
 * delimiter is never found, buf stays empty and the read position is
 * left just after the opening marker.
 */
static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len)
{
	/* elements whose entire content is discarded */
	static const struct {
		const gchar *open;	/* prefix, matched ignoring case */
		const gchar *close;	/* start of the end tag */
	} skipped[] = {
		{"<style",  "</style"},
		{"<script", "</script"}
	};
	gchar *hit;
	guint i;

	buf[0] = '\0';
	g_return_if_fail(*parser->bufp == '<');

	/* ignore comment / CSS / script stuff */
	if (strncmp(parser->bufp, "<!--", 4) == 0) {
		parser->bufp += 4;
		hit = html_find_str(parser, "-->");
		if (hit != NULL)
			parser->bufp = hit + 3;
		return;
	}

	for (i = 0; i < G_N_ELEMENTS(skipped); i++) {
		const gsize open_len = strlen(skipped[i].open);

		if (g_ascii_strncasecmp(parser->bufp, skipped[i].open,
					open_len) != 0)
			continue;

		parser->bufp += open_len;
		hit = html_find_str_case(parser, skipped[i].close);
		if (hit != NULL) {
			/* step over the end tag name, then over its '>' */
			parser->bufp = hit + strlen(skipped[i].close);
			hit = html_find_char(parser, '>');
			if (hit != NULL)
				parser->bufp = hit + 1;
		}
		return;
	}

	parser->bufp++;
	hit = html_find_char(parser, '>');
	if (hit == NULL)
		return;

	strncpy2(buf, parser->bufp, MIN(hit - parser->bufp + 1, len));
	g_strstrip(buf);
	parser->bufp = hit + 1;
}
725

    
726
/*
 * Return a newly allocated copy of str with entity references decoded.
 *
 * The result buffer has the size of the input, so a reference is only
 * replaced when the replacement is no longer than the reference itself.
 * References that are unknown, too long, unterminated, or that denote
 * NUL or an invalid Unicode code point are copied through unchanged;
 * in particular text such as "&b=2;" inside a URL is preserved rather
 * than dropped, and a decoded NUL can never truncate the result.
 *
 * Returns NULL when str is NULL; otherwise the caller frees the result
 * with g_free().
 */
static gchar *html_unescape_str(HTMLParser *parser, const gchar *str)
{
	const gchar *p = str;
	gchar symbol_name[9];	/* '&' + up to 6 characters + ';' + NUL */
	gint n;
	const gchar *val;
	gchar *unescape_str;
	gchar *up;

	if (!str)
		return NULL;

	up = unescape_str = g_malloc(strlen(str) + 1);

	while (*p != '\0') {
		const gchar *entity;	/* start of the reference in str */
		const gchar *repl;	/* bytes to write, NULL = verbatim */
		gint repl_len;
		gchar utf8[6];

		if (*p != '&') {
			*up++ = *p++;
			continue;
		}

		for (n = 0; p[n] != '\0' && p[n] != ';'; n++)
			;
		if (n > 7 || p[n] != ';') {
			/* not a reference: literal '&' */
			*up++ = *p++;
			continue;
		}

		entity = p;
		strncpy2(symbol_name, p, n + 2);
		p += n + 1;

		repl = NULL;
		repl_len = 0;

		val = g_hash_table_lookup(parser->symbol_table, symbol_name);
		if (val != NULL) {
			gint len = strlen(val);

			if (len <= n + 1) {
				repl = val;
				repl_len = len;
			}
		} else if (symbol_name[1] == '#' &&
			   g_ascii_isdigit(symbol_name[2])) {
			gint ch;

			ch = atoi(symbol_name + 2);
			if (ch < 128 && g_ascii_isprint(ch)) {
				utf8[0] = ch;
				repl = utf8;
				repl_len = 1;
			} else if (ch > 0 &&
				   g_unichar_validate((gunichar)ch)) {
				/* ISO 10646 to UTF-8 */
				gint len;

				len = g_unichar_to_utf8((gunichar)ch, utf8);
				if (len > 0 && len <= n + 1) {
					repl = utf8;
					repl_len = len;
				}
			}
		}

		if (repl == NULL) {
			/* no usable replacement: keep the reference text */
			repl = entity;
			repl_len = n + 1;
		}

		memcpy(up, repl, repl_len);
		up += repl_len;
	}

	*up = '\0';
	return unescape_str;
}