root / libsylph / html.c @ aebfd4cc
History | View | Annotate | Download (15.1 kB)
/*
 * LibSylph -- E-Mail client library
 * Copyright (C) 1999-2011 Hiroyuki Yamamoto
 */
| 5 | |
| 6 | #include <glib.h> |
| 7 | #include <stdio.h> |
| 8 | #include <string.h> |
| 9 | #include <ctype.h> |
| 10 | |
| 11 | #include "html.h" |
| 12 | #include "codeconv.h" |
| 13 | #include "utils.h" |
| 14 | |
/* Size in bytes of the on-stack line and tag scratch buffers. */
#define HTMLBUFSIZE	8192
/* Text emitted in place of an <hr> element. */
#define HR_STR		"------------------------------------------------"
| 17 | |
| 18 | typedef struct _HTMLSymbol HTMLSymbol; |
| 19 | |
| 20 | struct _HTMLSymbol
|
| 21 | {
|
| 22 | gchar *const key;
|
| 23 | gchar *const val;
|
| 24 | }; |
| 25 | |
| 26 | static HTMLSymbol symbol_list[] = {
|
| 27 | {"<" , "<"},
|
| 28 | {">" , ">"},
|
| 29 | {"&" , "&"},
|
| 30 | {""" , "\""}
|
| 31 | }; |
| 32 | |
| 33 | /*   - ÿ */
|
| 34 | static HTMLSymbol latin_symbol_list[] = {
|
| 35 | {" " , " "},
|
| 36 | /* {" " , "\302\240"}, */
|
| 37 | {"¡" , "\302\241"},
|
| 38 | {"¢" , "\302\242"},
|
| 39 | {"£" , "\302\243"},
|
| 40 | {"¤", "\302\244"},
|
| 41 | {"¥" , "\302\245"},
|
| 42 | {"¦", "\302\246"},
|
| 43 | {"§" , "\302\247"},
|
| 44 | {"¨" , "\302\250"},
|
| 45 | {"©" , "\302\251"},
|
| 46 | {"ª" , "\302\252"},
|
| 47 | {"«" , "\302\253"},
|
| 48 | {"¬" , "\302\254"},
|
| 49 | {"­" , "\302\255"},
|
| 50 | {"®" , "\302\256"},
|
| 51 | {"¯" , "\302\257"},
|
| 52 | {"°" , "\302\260"},
|
| 53 | {"&plusm;" , "\302\261"},
|
| 54 | {"²" , "\302\262"},
|
| 55 | {"³" , "\302\263"},
|
| 56 | {"´" , "\302\264"},
|
| 57 | {"µ" , "\302\265"},
|
| 58 | {"¶" , "\302\266"},
|
| 59 | {"·", "\302\267"},
|
| 60 | {"¸" , "\302\270"},
|
| 61 | {"¹" , "\302\271"},
|
| 62 | {"º" , "\302\272"},
|
| 63 | {"»" , "\302\273"},
|
| 64 | {"¼", "\302\274"},
|
| 65 | {"½", "\302\275"},
|
| 66 | {"¾", "\302\276"},
|
| 67 | {"¿", "\302\277"},
|
| 68 | |
| 69 | {"À", "\303\200"},
|
| 70 | {"Á", "\303\201"},
|
| 71 | {"Â" , "\303\202"},
|
| 72 | {"Ã", "\303\203"},
|
| 73 | {"Ä" , "\303\204"},
|
| 74 | {"Å" , "\303\205"},
|
| 75 | {"Æ" , "\303\206"},
|
| 76 | {"Ç", "\303\207"},
|
| 77 | {"È", "\303\210"},
|
| 78 | {"É", "\303\211"},
|
| 79 | {"Ê" , "\303\212"},
|
| 80 | {"Ë" , "\303\213"},
|
| 81 | {"Ì", "\303\214"},
|
| 82 | {"Í", "\303\215"},
|
| 83 | {"Î" , "\303\216"},
|
| 84 | {"Ï" , "\303\217"},
|
| 85 | {"Ð" , "\303\220"},
|
| 86 | {"Ñ", "\303\221"},
|
| 87 | {"Ò", "\303\222"},
|
| 88 | {"Ó", "\303\223"},
|
| 89 | {"Ô" , "\303\224"},
|
| 90 | {"Õ", "\303\225"},
|
| 91 | {"Ö" , "\303\226"},
|
| 92 | {"×" , "\303\227"},
|
| 93 | {"Ø", "\303\230"},
|
| 94 | {"Ù", "\303\231"},
|
| 95 | {"Ú", "\303\232"},
|
| 96 | {"Û" , "\303\233"},
|
| 97 | {"Ü" , "\303\234"},
|
| 98 | {"Ý", "\303\235"},
|
| 99 | {"Þ" , "\303\236"},
|
| 100 | {"ß" , "\303\237"},
|
| 101 | {"à", "\303\240"},
|
| 102 | {"á", "\303\241"},
|
| 103 | {"â" , "\303\242"},
|
| 104 | {"ã", "\303\243"},
|
| 105 | {"ä" , "\303\244"},
|
| 106 | {"å" , "\303\245"},
|
| 107 | {"æ" , "\303\246"},
|
| 108 | {"ç", "\303\247"},
|
| 109 | {"è", "\303\250"},
|
| 110 | {"é", "\303\251"},
|
| 111 | {"ê" , "\303\252"},
|
| 112 | {"ë" , "\303\253"},
|
| 113 | {"ì", "\303\254"},
|
| 114 | {"í", "\303\255"},
|
| 115 | {"î" , "\303\256"},
|
| 116 | {"ï" , "\303\257"},
|
| 117 | {"ð" , "\303\260"},
|
| 118 | {"ñ", "\303\261"},
|
| 119 | {"ò", "\303\262"},
|
| 120 | {"ó", "\303\263"},
|
| 121 | {"ô" , "\303\264"},
|
| 122 | {"õ", "\303\265"},
|
| 123 | {"ö" , "\303\266"},
|
| 124 | {"÷", "\303\267"},
|
| 125 | {"ø", "\303\270"},
|
| 126 | {"ù", "\303\271"},
|
| 127 | {"ú", "\303\272"},
|
| 128 | {"û" , "\303\273"},
|
| 129 | {"ü" , "\303\274"},
|
| 130 | {"ý", "\303\275"},
|
| 131 | {"þ" , "\303\276"},
|
| 132 | {"ÿ" , "\303\277"}
|
| 133 | }; |
| 134 | |
| 135 | static HTMLSymbol other_symbol_list[] = {
|
| 136 | /* Non-standard? */
|
| 137 | {"…" , "..."},
|
| 138 | {"’" , "'"},
|
| 139 | {"–" , "-"},
|
| 140 | {"™" , "\xe2\x84\xa2"},
|
| 141 | {"œ" , "\xc5\x93"},
|
| 142 | |
| 143 | /* Symbolic characters */
|
| 144 | {"™" , "\xe2\x84\xa2"},
|
| 145 | |
| 146 | /* Latin extended */
|
| 147 | {"Œ" , "\xc5\x92"},
|
| 148 | {"œ" , "\xc5\x93"},
|
| 149 | {"Š", "\xc5\xa0"},
|
| 150 | {"š", "\xc5\xa1"},
|
| 151 | {"Ÿ" , "\xc5\xb8"},
|
| 152 | {"ˆ" , "\xcb\x86"},
|
| 153 | {"˜" , "\xcb\x9c"},
|
| 154 | {"ƒ" , "\xc6\x92"},
|
| 155 | }; |
| 156 | |
| 157 | static GHashTable *default_symbol_table;
|
| 158 | |
| 159 | static HTMLState html_read_line (HTMLParser *parser);
|
| 160 | |
| 161 | static void html_append_char (HTMLParser *parser, |
| 162 | gchar ch); |
| 163 | static void html_append_str (HTMLParser *parser, |
| 164 | const gchar *str,
|
| 165 | gint len); |
| 166 | |
| 167 | static gchar *html_find_char (HTMLParser *parser,
|
| 168 | gchar ch); |
| 169 | static gchar *html_find_str (HTMLParser *parser,
|
| 170 | const gchar *str);
|
| 171 | static gchar *html_find_str_case (HTMLParser *parser,
|
| 172 | const gchar *str);
|
| 173 | |
| 174 | static HTMLState html_parse_tag (HTMLParser *parser);
|
| 175 | static void html_parse_special (HTMLParser *parser); |
| 176 | static void html_get_parenthesis (HTMLParser *parser, |
| 177 | gchar *buf, |
| 178 | gint len); |
| 179 | |
| 180 | |
| 181 | HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv) |
| 182 | {
|
| 183 | HTMLParser *parser; |
| 184 | |
| 185 | g_return_val_if_fail(fp != NULL, NULL); |
| 186 | g_return_val_if_fail(conv != NULL, NULL); |
| 187 | |
| 188 | parser = g_new0(HTMLParser, 1);
|
| 189 | parser->fp = fp; |
| 190 | parser->conv = conv; |
| 191 | parser->str = g_string_new(NULL);
|
| 192 | parser->buf = g_string_new(NULL);
|
| 193 | parser->bufp = parser->buf->str; |
| 194 | parser->state = HTML_NORMAL; |
| 195 | parser->href = NULL;
|
| 196 | parser->newline = TRUE; |
| 197 | parser->empty_line = TRUE; |
| 198 | parser->space = FALSE; |
| 199 | parser->pre = FALSE; |
| 200 | |
| 201 | #define SYMBOL_TABLE_ADD(table, list) \
|
| 202 | { \
|
| 203 | gint i; \ |
| 204 | \ |
| 205 | for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \ |
| 206 | g_hash_table_insert(table, list[i].key, list[i].val); \ |
| 207 | } |
| 208 | |
| 209 | if (!default_symbol_table) {
|
| 210 | default_symbol_table = |
| 211 | g_hash_table_new(g_str_hash, g_str_equal); |
| 212 | SYMBOL_TABLE_ADD(default_symbol_table, symbol_list); |
| 213 | SYMBOL_TABLE_ADD(default_symbol_table, latin_symbol_list); |
| 214 | SYMBOL_TABLE_ADD(default_symbol_table, other_symbol_list); |
| 215 | } |
| 216 | |
| 217 | #undef SYMBOL_TABLE_ADD
|
| 218 | |
| 219 | parser->symbol_table = default_symbol_table; |
| 220 | |
| 221 | return parser;
|
| 222 | } |
| 223 | |
| 224 | void html_parser_destroy(HTMLParser *parser)
|
| 225 | {
|
| 226 | g_string_free(parser->str, TRUE); |
| 227 | g_string_free(parser->buf, TRUE); |
| 228 | g_free(parser->href); |
| 229 | g_free(parser); |
| 230 | } |
| 231 | |
| 232 | const gchar *html_parse(HTMLParser *parser)
|
| 233 | {
|
| 234 | parser->state = HTML_NORMAL; |
| 235 | g_string_truncate(parser->str, 0);
|
| 236 | |
| 237 | if (*parser->bufp == '\0') { |
| 238 | g_string_truncate(parser->buf, 0);
|
| 239 | parser->bufp = parser->buf->str; |
| 240 | if (html_read_line(parser) == HTML_EOF)
|
| 241 | return NULL; |
| 242 | } |
| 243 | |
| 244 | while (*parser->bufp != '\0') { |
| 245 | switch (*parser->bufp) {
|
| 246 | case '<': |
| 247 | if (parser->str->len == 0) |
| 248 | html_parse_tag(parser); |
| 249 | else
|
| 250 | return parser->str->str;
|
| 251 | break;
|
| 252 | case '&': |
| 253 | html_parse_special(parser); |
| 254 | break;
|
| 255 | case ' ': |
| 256 | case '\t': |
| 257 | case '\r': |
| 258 | case '\n': |
| 259 | if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n') |
| 260 | parser->bufp++; |
| 261 | |
| 262 | if (!parser->pre) {
|
| 263 | if (!parser->newline)
|
| 264 | parser->space = TRUE; |
| 265 | |
| 266 | parser->bufp++; |
| 267 | break;
|
| 268 | } |
| 269 | /* fallthrough */
|
| 270 | default:
|
| 271 | html_append_char(parser, *parser->bufp++); |
| 272 | } |
| 273 | } |
| 274 | |
| 275 | return parser->str->str;
|
| 276 | } |
| 277 | |
| 278 | static HTMLState html_read_line(HTMLParser *parser)
|
| 279 | {
|
| 280 | gchar buf[HTMLBUFSIZE]; |
| 281 | gchar *conv_str; |
| 282 | gint index; |
| 283 | |
| 284 | if (fgets(buf, sizeof(buf), parser->fp) == NULL) { |
| 285 | parser->state = HTML_EOF; |
| 286 | return HTML_EOF;
|
| 287 | } |
| 288 | |
| 289 | conv_str = conv_convert(parser->conv, buf); |
| 290 | if (!conv_str) {
|
| 291 | index = parser->bufp - parser->buf->str; |
| 292 | |
| 293 | conv_str = conv_utf8todisp(buf, NULL);
|
| 294 | g_string_append(parser->buf, conv_str); |
| 295 | g_free(conv_str); |
| 296 | |
| 297 | parser->bufp = parser->buf->str + index; |
| 298 | |
| 299 | return HTML_CONV_FAILED;
|
| 300 | } |
| 301 | |
| 302 | index = parser->bufp - parser->buf->str; |
| 303 | |
| 304 | g_string_append(parser->buf, conv_str); |
| 305 | g_free(conv_str); |
| 306 | |
| 307 | parser->bufp = parser->buf->str + index; |
| 308 | |
| 309 | return HTML_NORMAL;
|
| 310 | } |
| 311 | |
| 312 | static void html_append_char(HTMLParser *parser, gchar ch) |
| 313 | {
|
| 314 | GString *str = parser->str; |
| 315 | |
| 316 | if (!parser->pre && parser->space) {
|
| 317 | g_string_append_c(str, ' ');
|
| 318 | parser->space = FALSE; |
| 319 | } |
| 320 | |
| 321 | g_string_append_c(str, ch); |
| 322 | |
| 323 | parser->empty_line = FALSE; |
| 324 | if (ch == '\n') { |
| 325 | parser->newline = TRUE; |
| 326 | if (str->len > 1 && str->str[str->len - 2] == '\n') |
| 327 | parser->empty_line = TRUE; |
| 328 | } else
|
| 329 | parser->newline = FALSE; |
| 330 | } |
| 331 | |
| 332 | static void html_append_str(HTMLParser *parser, const gchar *str, gint len) |
| 333 | {
|
| 334 | GString *string = parser->str; |
| 335 | |
| 336 | if (!parser->pre && parser->space) {
|
| 337 | g_string_append_c(string, ' ');
|
| 338 | parser->space = FALSE; |
| 339 | } |
| 340 | |
| 341 | if (len == 0) return; |
| 342 | if (len < 0) |
| 343 | g_string_append(string, str); |
| 344 | else
|
| 345 | g_string_append_len(string, str, len); |
| 346 | |
| 347 | parser->empty_line = FALSE; |
| 348 | if (string->len > 0 && string->str[string->len - 1] == '\n') { |
| 349 | parser->newline = TRUE; |
| 350 | if (string->len > 1 && string->str[string->len - 2] == '\n') |
| 351 | parser->empty_line = TRUE; |
| 352 | } else
|
| 353 | parser->newline = FALSE; |
| 354 | } |
| 355 | |
| 356 | static gchar *html_find_char(HTMLParser *parser, gchar ch)
|
| 357 | {
|
| 358 | gchar *p; |
| 359 | |
| 360 | while ((p = strchr(parser->bufp, ch)) == NULL) { |
| 361 | if (html_read_line(parser) == HTML_EOF)
|
| 362 | return NULL; |
| 363 | } |
| 364 | |
| 365 | return p;
|
| 366 | } |
| 367 | |
| 368 | static gchar *html_find_str(HTMLParser *parser, const gchar *str) |
| 369 | {
|
| 370 | gchar *p; |
| 371 | |
| 372 | while ((p = strstr(parser->bufp, str)) == NULL) { |
| 373 | if (html_read_line(parser) == HTML_EOF)
|
| 374 | return NULL; |
| 375 | } |
| 376 | |
| 377 | return p;
|
| 378 | } |
| 379 | |
| 380 | static gchar *html_find_str_case(HTMLParser *parser, const gchar *str) |
| 381 | {
|
| 382 | gchar *p; |
| 383 | |
| 384 | while ((p = strcasestr(parser->bufp, str)) == NULL) { |
| 385 | if (html_read_line(parser) == HTML_EOF)
|
| 386 | return NULL; |
| 387 | } |
| 388 | |
| 389 | return p;
|
| 390 | } |
| 391 | |
| 392 | static HTMLTag *html_get_tag(const gchar *str) |
| 393 | {
|
| 394 | HTMLTag *tag; |
| 395 | gchar *tmp; |
| 396 | gchar *tmpp; |
| 397 | |
| 398 | g_return_val_if_fail(str != NULL, NULL); |
| 399 | |
| 400 | if (*str == '\0' || *str == '!') return NULL; |
| 401 | |
| 402 | tmp = g_strdup(str); |
| 403 | |
| 404 | tag = g_new0(HTMLTag, 1);
|
| 405 | |
| 406 | for (tmpp = tmp; *tmpp != '\0' && !g_ascii_isspace(*tmpp); tmpp++) { |
| 407 | if (tmpp > tmp && *tmpp == '/') { |
| 408 | *tmpp = '\0';
|
| 409 | break;
|
| 410 | } |
| 411 | } |
| 412 | |
| 413 | if (*tmpp == '\0') { |
| 414 | g_strdown(tmp); |
| 415 | tag->name = tmp; |
| 416 | return tag;
|
| 417 | } else {
|
| 418 | *tmpp++ = '\0';
|
| 419 | g_strdown(tmp); |
| 420 | tag->name = g_strdup(tmp); |
| 421 | } |
| 422 | |
| 423 | while (*tmpp != '\0') { |
| 424 | HTMLAttr *attr; |
| 425 | gchar *attr_name; |
| 426 | gchar *attr_value; |
| 427 | gchar *p; |
| 428 | gchar quote; |
| 429 | |
| 430 | while (g_ascii_isspace(*tmpp)) tmpp++;
|
| 431 | if (tmpp > tmp && *tmpp == '/') |
| 432 | break;
|
| 433 | attr_name = tmpp; |
| 434 | |
| 435 | while (*tmpp != '\0' && !g_ascii_isspace(*tmpp) && |
| 436 | *tmpp != '=')
|
| 437 | tmpp++; |
| 438 | if (*tmpp != '\0' && *tmpp != '=') { |
| 439 | *tmpp++ = '\0';
|
| 440 | while (g_ascii_isspace(*tmpp)) tmpp++;
|
| 441 | } |
| 442 | |
| 443 | if (*tmpp == '=') { |
| 444 | *tmpp++ = '\0';
|
| 445 | while (g_ascii_isspace(*tmpp)) tmpp++;
|
| 446 | |
| 447 | if (*tmpp == '"' || *tmpp == '\'') { |
| 448 | /* name="value" */
|
| 449 | quote = *tmpp; |
| 450 | tmpp++; |
| 451 | attr_value = tmpp; |
| 452 | if ((p = strchr(attr_value, quote)) == NULL) { |
| 453 | g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
|
| 454 | break;
|
| 455 | } |
| 456 | tmpp = p; |
| 457 | *tmpp++ = '\0';
|
| 458 | while (g_ascii_isspace(*tmpp)) tmpp++;
|
| 459 | } else {
|
| 460 | /* name=value */
|
| 461 | attr_value = tmpp; |
| 462 | while (*tmpp != '\0' && !g_ascii_isspace(*tmpp)) tmpp++; |
| 463 | if (*tmpp != '\0') |
| 464 | *tmpp++ = '\0';
|
| 465 | } |
| 466 | } else
|
| 467 | attr_value = "";
|
| 468 | |
| 469 | g_strchomp(attr_name); |
| 470 | g_strdown(attr_name); |
| 471 | attr = g_new(HTMLAttr, 1);
|
| 472 | attr->name = g_strdup(attr_name); |
| 473 | attr->value = g_strdup(attr_value); |
| 474 | tag->attr = g_list_append(tag->attr, attr); |
| 475 | } |
| 476 | |
| 477 | g_free(tmp); |
| 478 | |
| 479 | return tag;
|
| 480 | } |
| 481 | |
| 482 | static void html_free_tag(HTMLTag *tag) |
| 483 | {
|
| 484 | if (!tag) return; |
| 485 | |
| 486 | g_free(tag->name); |
| 487 | while (tag->attr != NULL) { |
| 488 | HTMLAttr *attr = (HTMLAttr *)tag->attr->data; |
| 489 | g_free(attr->name); |
| 490 | g_free(attr->value); |
| 491 | g_free(attr); |
| 492 | tag->attr = g_list_remove(tag->attr, tag->attr->data); |
| 493 | } |
| 494 | g_free(tag); |
| 495 | } |
| 496 | |
| 497 | static HTMLState html_parse_tag(HTMLParser *parser)
|
| 498 | {
|
| 499 | gchar buf[HTMLBUFSIZE]; |
| 500 | HTMLTag *tag; |
| 501 | |
| 502 | html_get_parenthesis(parser, buf, sizeof(buf));
|
| 503 | |
| 504 | tag = html_get_tag(buf); |
| 505 | |
| 506 | parser->state = HTML_UNKNOWN; |
| 507 | if (!tag) return HTML_UNKNOWN; |
| 508 | |
| 509 | if (!strcmp(tag->name, "br")) { |
| 510 | parser->space = FALSE; |
| 511 | html_append_char(parser, '\n');
|
| 512 | parser->state = HTML_BR; |
| 513 | } else if (!strcmp(tag->name, "a")) { |
| 514 | GList *cur; |
| 515 | |
| 516 | for (cur = tag->attr; cur != NULL; cur = cur->next) { |
| 517 | HTMLAttr *attr = (HTMLAttr *)cur->data; |
| 518 | |
| 519 | if (attr && !strcmp(attr->name, "href")) { |
| 520 | g_free(parser->href); |
| 521 | parser->href = g_strdup(attr->value); |
| 522 | parser->state = HTML_HREF; |
| 523 | break;
|
| 524 | } |
| 525 | } |
| 526 | } else if (!strcmp(tag->name, "/a")) { |
| 527 | g_free(parser->href); |
| 528 | parser->href = NULL;
|
| 529 | parser->state = HTML_NORMAL; |
| 530 | } else if (!strcmp(tag->name, "p")) { |
| 531 | parser->space = FALSE; |
| 532 | if (!parser->empty_line) {
|
| 533 | parser->space = FALSE; |
| 534 | if (!parser->newline) html_append_char(parser, '\n'); |
| 535 | html_append_char(parser, '\n');
|
| 536 | } |
| 537 | parser->state = HTML_PAR; |
| 538 | } else if (!strcmp(tag->name, "pre")) { |
| 539 | parser->pre = TRUE; |
| 540 | parser->state = HTML_PRE; |
| 541 | } else if (!strcmp(tag->name, "/pre")) { |
| 542 | parser->pre = FALSE; |
| 543 | parser->state = HTML_NORMAL; |
| 544 | } else if (!strcmp(tag->name, "hr")) { |
| 545 | if (!parser->newline) {
|
| 546 | parser->space = FALSE; |
| 547 | html_append_char(parser, '\n');
|
| 548 | } |
| 549 | html_append_str(parser, HR_STR "\n", -1); |
| 550 | parser->state = HTML_HR; |
| 551 | } else if (!strcmp(tag->name, "div") || |
| 552 | !strcmp(tag->name, "ul") ||
|
| 553 | !strcmp(tag->name, "li") ||
|
| 554 | !strcmp(tag->name, "table") ||
|
| 555 | !strcmp(tag->name, "tr") ||
|
| 556 | (tag->name[0] == 'h' && g_ascii_isdigit(tag->name[1]))) { |
| 557 | if (!parser->newline) {
|
| 558 | parser->space = FALSE; |
| 559 | html_append_char(parser, '\n');
|
| 560 | } |
| 561 | parser->state = HTML_NORMAL; |
| 562 | } else if (!strcmp(tag->name, "/table") || |
| 563 | (tag->name[0] == '/' && |
| 564 | tag->name[1] == 'h' && |
| 565 | g_ascii_isdigit(tag->name[1]))) {
|
| 566 | if (!parser->empty_line) {
|
| 567 | parser->space = FALSE; |
| 568 | if (!parser->newline) html_append_char(parser, '\n'); |
| 569 | html_append_char(parser, '\n');
|
| 570 | } |
| 571 | parser->state = HTML_NORMAL; |
| 572 | } else if (!strcmp(tag->name, "/div") || |
| 573 | !strcmp(tag->name, "/ul") ||
|
| 574 | !strcmp(tag->name, "/li")) {
|
| 575 | if (!parser->newline) {
|
| 576 | parser->space = FALSE; |
| 577 | html_append_char(parser, '\n');
|
| 578 | } |
| 579 | parser->state = HTML_NORMAL; |
| 580 | } |
| 581 | |
| 582 | html_free_tag(tag); |
| 583 | |
| 584 | return parser->state;
|
| 585 | } |
| 586 | |
| 587 | static void html_parse_special(HTMLParser *parser) |
| 588 | {
|
| 589 | gchar symbol_name[9];
|
| 590 | gint n; |
| 591 | const gchar *val;
|
| 592 | |
| 593 | parser->state = HTML_UNKNOWN; |
| 594 | g_return_if_fail(*parser->bufp == '&');
|
| 595 | |
| 596 | /* &foo; */
|
| 597 | for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++) |
| 598 | ; |
| 599 | if (n > 7 || parser->bufp[n] != ';') { |
| 600 | /* output literal `&' */
|
| 601 | html_append_char(parser, *parser->bufp++); |
| 602 | parser->state = HTML_NORMAL; |
| 603 | return;
|
| 604 | } |
| 605 | strncpy2(symbol_name, parser->bufp, n + 2);
|
| 606 | parser->bufp += n + 1;
|
| 607 | |
| 608 | if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
|
| 609 | != NULL) {
|
| 610 | html_append_str(parser, val, -1);
|
| 611 | parser->state = HTML_NORMAL; |
| 612 | return;
|
| 613 | } else if (symbol_name[1] == '#' && g_ascii_isdigit(symbol_name[2])) { |
| 614 | gint ch; |
| 615 | |
| 616 | ch = atoi(symbol_name + 2);
|
| 617 | if (ch < 128 && g_ascii_isprint(ch)) { |
| 618 | html_append_char(parser, ch); |
| 619 | parser->state = HTML_NORMAL; |
| 620 | return;
|
| 621 | } else {
|
| 622 | /* ISO 10646 to UTF-8 */
|
| 623 | gchar buf[6];
|
| 624 | gint len; |
| 625 | |
| 626 | len = g_unichar_to_utf8((gunichar)ch, buf); |
| 627 | if (len > 0) { |
| 628 | html_append_str(parser, buf, len); |
| 629 | parser->state = HTML_NORMAL; |
| 630 | return;
|
| 631 | } |
| 632 | } |
| 633 | } |
| 634 | |
| 635 | html_append_str(parser, symbol_name, -1);
|
| 636 | } |
| 637 | |
| 638 | static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len) |
| 639 | {
|
| 640 | gchar *p; |
| 641 | |
| 642 | buf[0] = '\0'; |
| 643 | g_return_if_fail(*parser->bufp == '<');
|
| 644 | |
| 645 | /* ignore comment / CSS / script stuff */
|
| 646 | if (!strncmp(parser->bufp, "<!--", 4)) { |
| 647 | parser->bufp += 4;
|
| 648 | if ((p = html_find_str(parser, "-->")) != NULL) |
| 649 | parser->bufp = p + 3;
|
| 650 | return;
|
| 651 | } |
| 652 | if (!g_ascii_strncasecmp(parser->bufp, "<style", 6)) { |
| 653 | parser->bufp += 6;
|
| 654 | if ((p = html_find_str_case(parser, "</style")) != NULL) { |
| 655 | parser->bufp = p + 7;
|
| 656 | if ((p = html_find_char(parser, '>')) != NULL) |
| 657 | parser->bufp = p + 1;
|
| 658 | } |
| 659 | return;
|
| 660 | } |
| 661 | if (!g_ascii_strncasecmp(parser->bufp, "<script", 7)) { |
| 662 | parser->bufp += 7;
|
| 663 | if ((p = html_find_str_case(parser, "</script")) != NULL) { |
| 664 | parser->bufp = p + 8;
|
| 665 | if ((p = html_find_char(parser, '>')) != NULL) |
| 666 | parser->bufp = p + 1;
|
| 667 | } |
| 668 | return;
|
| 669 | } |
| 670 | |
| 671 | parser->bufp++; |
| 672 | if ((p = html_find_char(parser, '>')) == NULL) |
| 673 | return;
|
| 674 | |
| 675 | strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
|
| 676 | g_strstrip(buf); |
| 677 | parser->bufp = p + 1;
|
| 678 | } |