root / libsylph / html.c @ 3070
History | View | Annotate | Download (15.8 kB)
/*
 * LibSylph -- E-Mail client library
 * Copyright (C) 1999-2011 Hiroyuki Yamamoto
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
| 19 | |
#include <glib.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "html.h"
#include "codeconv.h"
#include "utils.h"
| 28 | |
/* Size in bytes of the on-stack buffers used for one input line and for
 * the text of one tag. */
#define HTMLBUFSIZE	8192

/* Text emitted in place of an <hr> element. */
#define HR_STR		"------------------------------------------------"
| 31 | |
| 32 | typedef struct _HTMLSymbol HTMLSymbol; |
| 33 | |
| 34 | struct _HTMLSymbol
|
| 35 | {
|
| 36 | gchar *const key;
|
| 37 | gchar *const val;
|
| 38 | }; |
| 39 | |
| 40 | static HTMLSymbol symbol_list[] = {
|
| 41 | {"<" , "<"},
|
| 42 | {">" , ">"},
|
| 43 | {"&" , "&"},
|
| 44 | {""" , "\""}
|
| 45 | }; |
| 46 | |
| 47 | /*   - ÿ */
|
| 48 | static HTMLSymbol latin_symbol_list[] = {
|
| 49 | {" " , " "},
|
| 50 | /* {" " , "\302\240"}, */
|
| 51 | {"¡" , "\302\241"},
|
| 52 | {"¢" , "\302\242"},
|
| 53 | {"£" , "\302\243"},
|
| 54 | {"¤", "\302\244"},
|
| 55 | {"¥" , "\302\245"},
|
| 56 | {"¦", "\302\246"},
|
| 57 | {"§" , "\302\247"},
|
| 58 | {"¨" , "\302\250"},
|
| 59 | {"©" , "\302\251"},
|
| 60 | {"ª" , "\302\252"},
|
| 61 | {"«" , "\302\253"},
|
| 62 | {"¬" , "\302\254"},
|
| 63 | {"­" , "\302\255"},
|
| 64 | {"®" , "\302\256"},
|
| 65 | {"¯" , "\302\257"},
|
| 66 | {"°" , "\302\260"},
|
| 67 | {"&plusm;" , "\302\261"},
|
| 68 | {"²" , "\302\262"},
|
| 69 | {"³" , "\302\263"},
|
| 70 | {"´" , "\302\264"},
|
| 71 | {"µ" , "\302\265"},
|
| 72 | {"¶" , "\302\266"},
|
| 73 | {"·", "\302\267"},
|
| 74 | {"¸" , "\302\270"},
|
| 75 | {"¹" , "\302\271"},
|
| 76 | {"º" , "\302\272"},
|
| 77 | {"»" , "\302\273"},
|
| 78 | {"¼", "\302\274"},
|
| 79 | {"½", "\302\275"},
|
| 80 | {"¾", "\302\276"},
|
| 81 | {"¿", "\302\277"},
|
| 82 | |
| 83 | {"À", "\303\200"},
|
| 84 | {"Á", "\303\201"},
|
| 85 | {"Â" , "\303\202"},
|
| 86 | {"Ã", "\303\203"},
|
| 87 | {"Ä" , "\303\204"},
|
| 88 | {"Å" , "\303\205"},
|
| 89 | {"Æ" , "\303\206"},
|
| 90 | {"Ç", "\303\207"},
|
| 91 | {"È", "\303\210"},
|
| 92 | {"É", "\303\211"},
|
| 93 | {"Ê" , "\303\212"},
|
| 94 | {"Ë" , "\303\213"},
|
| 95 | {"Ì", "\303\214"},
|
| 96 | {"Í", "\303\215"},
|
| 97 | {"Î" , "\303\216"},
|
| 98 | {"Ï" , "\303\217"},
|
| 99 | {"Ð" , "\303\220"},
|
| 100 | {"Ñ", "\303\221"},
|
| 101 | {"Ò", "\303\222"},
|
| 102 | {"Ó", "\303\223"},
|
| 103 | {"Ô" , "\303\224"},
|
| 104 | {"Õ", "\303\225"},
|
| 105 | {"Ö" , "\303\226"},
|
| 106 | {"×" , "\303\227"},
|
| 107 | {"Ø", "\303\230"},
|
| 108 | {"Ù", "\303\231"},
|
| 109 | {"Ú", "\303\232"},
|
| 110 | {"Û" , "\303\233"},
|
| 111 | {"Ü" , "\303\234"},
|
| 112 | {"Ý", "\303\235"},
|
| 113 | {"Þ" , "\303\236"},
|
| 114 | {"ß" , "\303\237"},
|
| 115 | {"à", "\303\240"},
|
| 116 | {"á", "\303\241"},
|
| 117 | {"â" , "\303\242"},
|
| 118 | {"ã", "\303\243"},
|
| 119 | {"ä" , "\303\244"},
|
| 120 | {"å" , "\303\245"},
|
| 121 | {"æ" , "\303\246"},
|
| 122 | {"ç", "\303\247"},
|
| 123 | {"è", "\303\250"},
|
| 124 | {"é", "\303\251"},
|
| 125 | {"ê" , "\303\252"},
|
| 126 | {"ë" , "\303\253"},
|
| 127 | {"ì", "\303\254"},
|
| 128 | {"í", "\303\255"},
|
| 129 | {"î" , "\303\256"},
|
| 130 | {"ï" , "\303\257"},
|
| 131 | {"ð" , "\303\260"},
|
| 132 | {"ñ", "\303\261"},
|
| 133 | {"ò", "\303\262"},
|
| 134 | {"ó", "\303\263"},
|
| 135 | {"ô" , "\303\264"},
|
| 136 | {"õ", "\303\265"},
|
| 137 | {"ö" , "\303\266"},
|
| 138 | {"÷", "\303\267"},
|
| 139 | {"ø", "\303\270"},
|
| 140 | {"ù", "\303\271"},
|
| 141 | {"ú", "\303\272"},
|
| 142 | {"û" , "\303\273"},
|
| 143 | {"ü" , "\303\274"},
|
| 144 | {"ý", "\303\275"},
|
| 145 | {"þ" , "\303\276"},
|
| 146 | {"ÿ" , "\303\277"}
|
| 147 | }; |
| 148 | |
| 149 | static HTMLSymbol other_symbol_list[] = {
|
| 150 | /* Non-standard? */
|
| 151 | {"…" , "..."},
|
| 152 | {"’" , "'"},
|
| 153 | {"–" , "-"},
|
| 154 | {"™" , "\xe2\x84\xa2"},
|
| 155 | {"œ" , "\xc5\x93"},
|
| 156 | |
| 157 | /* Symbolic characters */
|
| 158 | {"™" , "\xe2\x84\xa2"},
|
| 159 | |
| 160 | /* Latin extended */
|
| 161 | {"Œ" , "\xc5\x92"},
|
| 162 | {"œ" , "\xc5\x93"},
|
| 163 | {"Š", "\xc5\xa0"},
|
| 164 | {"š", "\xc5\xa1"},
|
| 165 | {"Ÿ" , "\xc5\xb8"},
|
| 166 | {"ˆ" , "\xcb\x86"},
|
| 167 | {"˜" , "\xcb\x9c"},
|
| 168 | {"ƒ" , "\xc6\x92"},
|
| 169 | }; |
| 170 | |
| 171 | static GHashTable *default_symbol_table;
|
| 172 | |
| 173 | static HTMLState html_read_line (HTMLParser *parser);
|
| 174 | |
| 175 | static void html_append_char (HTMLParser *parser, |
| 176 | gchar ch); |
| 177 | static void html_append_str (HTMLParser *parser, |
| 178 | const gchar *str,
|
| 179 | gint len); |
| 180 | |
| 181 | static gchar *html_find_char (HTMLParser *parser,
|
| 182 | gchar ch); |
| 183 | static gchar *html_find_str (HTMLParser *parser,
|
| 184 | const gchar *str);
|
| 185 | static gchar *html_find_str_case (HTMLParser *parser,
|
| 186 | const gchar *str);
|
| 187 | |
| 188 | static HTMLState html_parse_tag (HTMLParser *parser);
|
| 189 | static void html_parse_special (HTMLParser *parser); |
| 190 | static void html_get_parenthesis (HTMLParser *parser, |
| 191 | gchar *buf, |
| 192 | gint len); |
| 193 | |
| 194 | |
| 195 | HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv) |
| 196 | {
|
| 197 | HTMLParser *parser; |
| 198 | |
| 199 | g_return_val_if_fail(fp != NULL, NULL); |
| 200 | g_return_val_if_fail(conv != NULL, NULL); |
| 201 | |
| 202 | parser = g_new0(HTMLParser, 1);
|
| 203 | parser->fp = fp; |
| 204 | parser->conv = conv; |
| 205 | parser->str = g_string_new(NULL);
|
| 206 | parser->buf = g_string_new(NULL);
|
| 207 | parser->bufp = parser->buf->str; |
| 208 | parser->state = HTML_NORMAL; |
| 209 | parser->href = NULL;
|
| 210 | parser->newline = TRUE; |
| 211 | parser->empty_line = TRUE; |
| 212 | parser->space = FALSE; |
| 213 | parser->pre = FALSE; |
| 214 | |
| 215 | #define SYMBOL_TABLE_ADD(table, list) \
|
| 216 | { \
|
| 217 | gint i; \ |
| 218 | \ |
| 219 | for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \ |
| 220 | g_hash_table_insert(table, list[i].key, list[i].val); \ |
| 221 | } |
| 222 | |
| 223 | if (!default_symbol_table) {
|
| 224 | default_symbol_table = |
| 225 | g_hash_table_new(g_str_hash, g_str_equal); |
| 226 | SYMBOL_TABLE_ADD(default_symbol_table, symbol_list); |
| 227 | SYMBOL_TABLE_ADD(default_symbol_table, latin_symbol_list); |
| 228 | SYMBOL_TABLE_ADD(default_symbol_table, other_symbol_list); |
| 229 | } |
| 230 | |
| 231 | #undef SYMBOL_TABLE_ADD
|
| 232 | |
| 233 | parser->symbol_table = default_symbol_table; |
| 234 | |
| 235 | return parser;
|
| 236 | } |
| 237 | |
| 238 | void html_parser_destroy(HTMLParser *parser)
|
| 239 | {
|
| 240 | g_string_free(parser->str, TRUE); |
| 241 | g_string_free(parser->buf, TRUE); |
| 242 | g_free(parser->href); |
| 243 | g_free(parser); |
| 244 | } |
| 245 | |
| 246 | const gchar *html_parse(HTMLParser *parser)
|
| 247 | {
|
| 248 | parser->state = HTML_NORMAL; |
| 249 | g_string_truncate(parser->str, 0);
|
| 250 | |
| 251 | if (*parser->bufp == '\0') { |
| 252 | g_string_truncate(parser->buf, 0);
|
| 253 | parser->bufp = parser->buf->str; |
| 254 | if (html_read_line(parser) == HTML_EOF)
|
| 255 | return NULL; |
| 256 | } |
| 257 | |
| 258 | while (*parser->bufp != '\0') { |
| 259 | switch (*parser->bufp) {
|
| 260 | case '<': |
| 261 | if (parser->str->len == 0) |
| 262 | html_parse_tag(parser); |
| 263 | else
|
| 264 | return parser->str->str;
|
| 265 | break;
|
| 266 | case '&': |
| 267 | html_parse_special(parser); |
| 268 | break;
|
| 269 | case ' ': |
| 270 | case '\t': |
| 271 | case '\r': |
| 272 | case '\n': |
| 273 | if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n') |
| 274 | parser->bufp++; |
| 275 | |
| 276 | if (!parser->pre) {
|
| 277 | if (!parser->newline)
|
| 278 | parser->space = TRUE; |
| 279 | |
| 280 | parser->bufp++; |
| 281 | break;
|
| 282 | } |
| 283 | /* fallthrough */
|
| 284 | default:
|
| 285 | html_append_char(parser, *parser->bufp++); |
| 286 | } |
| 287 | } |
| 288 | |
| 289 | return parser->str->str;
|
| 290 | } |
| 291 | |
| 292 | static HTMLState html_read_line(HTMLParser *parser)
|
| 293 | {
|
| 294 | gchar buf[HTMLBUFSIZE]; |
| 295 | gchar *conv_str; |
| 296 | gint index; |
| 297 | |
| 298 | if (fgets(buf, sizeof(buf), parser->fp) == NULL) { |
| 299 | parser->state = HTML_EOF; |
| 300 | return HTML_EOF;
|
| 301 | } |
| 302 | |
| 303 | conv_str = conv_convert(parser->conv, buf); |
| 304 | if (!conv_str) {
|
| 305 | index = parser->bufp - parser->buf->str; |
| 306 | |
| 307 | conv_str = conv_utf8todisp(buf, NULL);
|
| 308 | g_string_append(parser->buf, conv_str); |
| 309 | g_free(conv_str); |
| 310 | |
| 311 | parser->bufp = parser->buf->str + index; |
| 312 | |
| 313 | return HTML_CONV_FAILED;
|
| 314 | } |
| 315 | |
| 316 | index = parser->bufp - parser->buf->str; |
| 317 | |
| 318 | g_string_append(parser->buf, conv_str); |
| 319 | g_free(conv_str); |
| 320 | |
| 321 | parser->bufp = parser->buf->str + index; |
| 322 | |
| 323 | return HTML_NORMAL;
|
| 324 | } |
| 325 | |
| 326 | static void html_append_char(HTMLParser *parser, gchar ch) |
| 327 | {
|
| 328 | GString *str = parser->str; |
| 329 | |
| 330 | if (!parser->pre && parser->space) {
|
| 331 | g_string_append_c(str, ' ');
|
| 332 | parser->space = FALSE; |
| 333 | } |
| 334 | |
| 335 | g_string_append_c(str, ch); |
| 336 | |
| 337 | parser->empty_line = FALSE; |
| 338 | if (ch == '\n') { |
| 339 | parser->newline = TRUE; |
| 340 | if (str->len > 1 && str->str[str->len - 2] == '\n') |
| 341 | parser->empty_line = TRUE; |
| 342 | } else
|
| 343 | parser->newline = FALSE; |
| 344 | } |
| 345 | |
| 346 | static void html_append_str(HTMLParser *parser, const gchar *str, gint len) |
| 347 | {
|
| 348 | GString *string = parser->str; |
| 349 | |
| 350 | if (!parser->pre && parser->space) {
|
| 351 | g_string_append_c(string, ' ');
|
| 352 | parser->space = FALSE; |
| 353 | } |
| 354 | |
| 355 | if (len == 0) return; |
| 356 | if (len < 0) |
| 357 | g_string_append(string, str); |
| 358 | else
|
| 359 | g_string_append_len(string, str, len); |
| 360 | |
| 361 | parser->empty_line = FALSE; |
| 362 | if (string->len > 0 && string->str[string->len - 1] == '\n') { |
| 363 | parser->newline = TRUE; |
| 364 | if (string->len > 1 && string->str[string->len - 2] == '\n') |
| 365 | parser->empty_line = TRUE; |
| 366 | } else
|
| 367 | parser->newline = FALSE; |
| 368 | } |
| 369 | |
| 370 | static gchar *html_find_char(HTMLParser *parser, gchar ch)
|
| 371 | {
|
| 372 | gchar *p; |
| 373 | |
| 374 | while ((p = strchr(parser->bufp, ch)) == NULL) { |
| 375 | if (html_read_line(parser) == HTML_EOF)
|
| 376 | return NULL; |
| 377 | } |
| 378 | |
| 379 | return p;
|
| 380 | } |
| 381 | |
| 382 | static gchar *html_find_str(HTMLParser *parser, const gchar *str) |
| 383 | {
|
| 384 | gchar *p; |
| 385 | |
| 386 | while ((p = strstr(parser->bufp, str)) == NULL) { |
| 387 | if (html_read_line(parser) == HTML_EOF)
|
| 388 | return NULL; |
| 389 | } |
| 390 | |
| 391 | return p;
|
| 392 | } |
| 393 | |
| 394 | static gchar *html_find_str_case(HTMLParser *parser, const gchar *str) |
| 395 | {
|
| 396 | gchar *p; |
| 397 | |
| 398 | while ((p = strcasestr(parser->bufp, str)) == NULL) { |
| 399 | if (html_read_line(parser) == HTML_EOF)
|
| 400 | return NULL; |
| 401 | } |
| 402 | |
| 403 | return p;
|
| 404 | } |
| 405 | |
| 406 | static HTMLTag *html_get_tag(const gchar *str) |
| 407 | {
|
| 408 | HTMLTag *tag; |
| 409 | gchar *tmp; |
| 410 | gchar *tmpp; |
| 411 | |
| 412 | g_return_val_if_fail(str != NULL, NULL); |
| 413 | |
| 414 | if (*str == '\0' || *str == '!') return NULL; |
| 415 | |
| 416 | tmp = g_strdup(str); |
| 417 | |
| 418 | tag = g_new0(HTMLTag, 1);
|
| 419 | |
| 420 | for (tmpp = tmp; *tmpp != '\0' && !g_ascii_isspace(*tmpp); tmpp++) { |
| 421 | if (tmpp > tmp && *tmpp == '/') { |
| 422 | *tmpp = '\0';
|
| 423 | break;
|
| 424 | } |
| 425 | } |
| 426 | |
| 427 | if (*tmpp == '\0') { |
| 428 | g_strdown(tmp); |
| 429 | tag->name = tmp; |
| 430 | return tag;
|
| 431 | } else {
|
| 432 | *tmpp++ = '\0';
|
| 433 | g_strdown(tmp); |
| 434 | tag->name = g_strdup(tmp); |
| 435 | } |
| 436 | |
| 437 | while (*tmpp != '\0') { |
| 438 | HTMLAttr *attr; |
| 439 | gchar *attr_name; |
| 440 | gchar *attr_value; |
| 441 | gchar *p; |
| 442 | gchar quote; |
| 443 | |
| 444 | while (g_ascii_isspace(*tmpp)) tmpp++;
|
| 445 | if (tmpp > tmp && *tmpp == '/') |
| 446 | break;
|
| 447 | attr_name = tmpp; |
| 448 | |
| 449 | while (*tmpp != '\0' && !g_ascii_isspace(*tmpp) && |
| 450 | *tmpp != '=')
|
| 451 | tmpp++; |
| 452 | if (*tmpp != '\0' && *tmpp != '=') { |
| 453 | *tmpp++ = '\0';
|
| 454 | while (g_ascii_isspace(*tmpp)) tmpp++;
|
| 455 | } |
| 456 | |
| 457 | if (*tmpp == '=') { |
| 458 | *tmpp++ = '\0';
|
| 459 | while (g_ascii_isspace(*tmpp)) tmpp++;
|
| 460 | |
| 461 | if (*tmpp == '"' || *tmpp == '\'') { |
| 462 | /* name="value" */
|
| 463 | quote = *tmpp; |
| 464 | tmpp++; |
| 465 | attr_value = tmpp; |
| 466 | if ((p = strchr(attr_value, quote)) == NULL) { |
| 467 | g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
|
| 468 | break;
|
| 469 | } |
| 470 | tmpp = p; |
| 471 | *tmpp++ = '\0';
|
| 472 | while (g_ascii_isspace(*tmpp)) tmpp++;
|
| 473 | } else {
|
| 474 | /* name=value */
|
| 475 | attr_value = tmpp; |
| 476 | while (*tmpp != '\0' && !g_ascii_isspace(*tmpp)) tmpp++; |
| 477 | if (*tmpp != '\0') |
| 478 | *tmpp++ = '\0';
|
| 479 | } |
| 480 | } else
|
| 481 | attr_value = "";
|
| 482 | |
| 483 | g_strchomp(attr_name); |
| 484 | g_strdown(attr_name); |
| 485 | attr = g_new(HTMLAttr, 1);
|
| 486 | attr->name = g_strdup(attr_name); |
| 487 | attr->value = g_strdup(attr_value); |
| 488 | tag->attr = g_list_append(tag->attr, attr); |
| 489 | } |
| 490 | |
| 491 | g_free(tmp); |
| 492 | |
| 493 | return tag;
|
| 494 | } |
| 495 | |
| 496 | static void html_free_tag(HTMLTag *tag) |
| 497 | {
|
| 498 | if (!tag) return; |
| 499 | |
| 500 | g_free(tag->name); |
| 501 | while (tag->attr != NULL) { |
| 502 | HTMLAttr *attr = (HTMLAttr *)tag->attr->data; |
| 503 | g_free(attr->name); |
| 504 | g_free(attr->value); |
| 505 | g_free(attr); |
| 506 | tag->attr = g_list_remove(tag->attr, tag->attr->data); |
| 507 | } |
| 508 | g_free(tag); |
| 509 | } |
| 510 | |
| 511 | static HTMLState html_parse_tag(HTMLParser *parser)
|
| 512 | {
|
| 513 | gchar buf[HTMLBUFSIZE]; |
| 514 | HTMLTag *tag; |
| 515 | |
| 516 | html_get_parenthesis(parser, buf, sizeof(buf));
|
| 517 | |
| 518 | tag = html_get_tag(buf); |
| 519 | |
| 520 | parser->state = HTML_UNKNOWN; |
| 521 | if (!tag) return HTML_UNKNOWN; |
| 522 | |
| 523 | if (!strcmp(tag->name, "br")) { |
| 524 | parser->space = FALSE; |
| 525 | html_append_char(parser, '\n');
|
| 526 | parser->state = HTML_BR; |
| 527 | } else if (!strcmp(tag->name, "a")) { |
| 528 | GList *cur; |
| 529 | |
| 530 | for (cur = tag->attr; cur != NULL; cur = cur->next) { |
| 531 | HTMLAttr *attr = (HTMLAttr *)cur->data; |
| 532 | |
| 533 | if (attr && !strcmp(attr->name, "href")) { |
| 534 | g_free(parser->href); |
| 535 | parser->href = g_strdup(attr->value); |
| 536 | parser->state = HTML_HREF; |
| 537 | break;
|
| 538 | } |
| 539 | } |
| 540 | } else if (!strcmp(tag->name, "/a")) { |
| 541 | g_free(parser->href); |
| 542 | parser->href = NULL;
|
| 543 | parser->state = HTML_NORMAL; |
| 544 | } else if (!strcmp(tag->name, "p")) { |
| 545 | parser->space = FALSE; |
| 546 | if (!parser->empty_line) {
|
| 547 | parser->space = FALSE; |
| 548 | if (!parser->newline) html_append_char(parser, '\n'); |
| 549 | html_append_char(parser, '\n');
|
| 550 | } |
| 551 | parser->state = HTML_PAR; |
| 552 | } else if (!strcmp(tag->name, "pre")) { |
| 553 | parser->pre = TRUE; |
| 554 | parser->state = HTML_PRE; |
| 555 | } else if (!strcmp(tag->name, "/pre")) { |
| 556 | parser->pre = FALSE; |
| 557 | parser->state = HTML_NORMAL; |
| 558 | } else if (!strcmp(tag->name, "hr")) { |
| 559 | if (!parser->newline) {
|
| 560 | parser->space = FALSE; |
| 561 | html_append_char(parser, '\n');
|
| 562 | } |
| 563 | html_append_str(parser, HR_STR "\n", -1); |
| 564 | parser->state = HTML_HR; |
| 565 | } else if (!strcmp(tag->name, "div") || |
| 566 | !strcmp(tag->name, "ul") ||
|
| 567 | !strcmp(tag->name, "li") ||
|
| 568 | !strcmp(tag->name, "table") ||
|
| 569 | !strcmp(tag->name, "tr") ||
|
| 570 | (tag->name[0] == 'h' && g_ascii_isdigit(tag->name[1]))) { |
| 571 | if (!parser->newline) {
|
| 572 | parser->space = FALSE; |
| 573 | html_append_char(parser, '\n');
|
| 574 | } |
| 575 | parser->state = HTML_NORMAL; |
| 576 | } else if (!strcmp(tag->name, "/table") || |
| 577 | (tag->name[0] == '/' && |
| 578 | tag->name[1] == 'h' && |
| 579 | g_ascii_isdigit(tag->name[1]))) {
|
| 580 | if (!parser->empty_line) {
|
| 581 | parser->space = FALSE; |
| 582 | if (!parser->newline) html_append_char(parser, '\n'); |
| 583 | html_append_char(parser, '\n');
|
| 584 | } |
| 585 | parser->state = HTML_NORMAL; |
| 586 | } else if (!strcmp(tag->name, "/div") || |
| 587 | !strcmp(tag->name, "/ul") ||
|
| 588 | !strcmp(tag->name, "/li")) {
|
| 589 | if (!parser->newline) {
|
| 590 | parser->space = FALSE; |
| 591 | html_append_char(parser, '\n');
|
| 592 | } |
| 593 | parser->state = HTML_NORMAL; |
| 594 | } |
| 595 | |
| 596 | html_free_tag(tag); |
| 597 | |
| 598 | return parser->state;
|
| 599 | } |
| 600 | |
| 601 | static void html_parse_special(HTMLParser *parser) |
| 602 | {
|
| 603 | gchar symbol_name[9];
|
| 604 | gint n; |
| 605 | const gchar *val;
|
| 606 | |
| 607 | parser->state = HTML_UNKNOWN; |
| 608 | g_return_if_fail(*parser->bufp == '&');
|
| 609 | |
| 610 | /* &foo; */
|
| 611 | for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++) |
| 612 | ; |
| 613 | if (n > 7 || parser->bufp[n] != ';') { |
| 614 | /* output literal `&' */
|
| 615 | html_append_char(parser, *parser->bufp++); |
| 616 | parser->state = HTML_NORMAL; |
| 617 | return;
|
| 618 | } |
| 619 | strncpy2(symbol_name, parser->bufp, n + 2);
|
| 620 | parser->bufp += n + 1;
|
| 621 | |
| 622 | if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
|
| 623 | != NULL) {
|
| 624 | html_append_str(parser, val, -1);
|
| 625 | parser->state = HTML_NORMAL; |
| 626 | return;
|
| 627 | } else if (symbol_name[1] == '#' && g_ascii_isdigit(symbol_name[2])) { |
| 628 | gint ch; |
| 629 | |
| 630 | ch = atoi(symbol_name + 2);
|
| 631 | if (ch < 128 && g_ascii_isprint(ch)) { |
| 632 | html_append_char(parser, ch); |
| 633 | parser->state = HTML_NORMAL; |
| 634 | return;
|
| 635 | } else {
|
| 636 | /* ISO 10646 to UTF-8 */
|
| 637 | gchar buf[6];
|
| 638 | gint len; |
| 639 | |
| 640 | len = g_unichar_to_utf8((gunichar)ch, buf); |
| 641 | if (len > 0) { |
| 642 | html_append_str(parser, buf, len); |
| 643 | parser->state = HTML_NORMAL; |
| 644 | return;
|
| 645 | } |
| 646 | } |
| 647 | } |
| 648 | |
| 649 | html_append_str(parser, symbol_name, -1);
|
| 650 | } |
| 651 | |
| 652 | static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len) |
| 653 | {
|
| 654 | gchar *p; |
| 655 | |
| 656 | buf[0] = '\0'; |
| 657 | g_return_if_fail(*parser->bufp == '<');
|
| 658 | |
| 659 | /* ignore comment / CSS / script stuff */
|
| 660 | if (!strncmp(parser->bufp, "<!--", 4)) { |
| 661 | parser->bufp += 4;
|
| 662 | if ((p = html_find_str(parser, "-->")) != NULL) |
| 663 | parser->bufp = p + 3;
|
| 664 | return;
|
| 665 | } |
| 666 | if (!g_ascii_strncasecmp(parser->bufp, "<style", 6)) { |
| 667 | parser->bufp += 6;
|
| 668 | if ((p = html_find_str_case(parser, "</style")) != NULL) { |
| 669 | parser->bufp = p + 7;
|
| 670 | if ((p = html_find_char(parser, '>')) != NULL) |
| 671 | parser->bufp = p + 1;
|
| 672 | } |
| 673 | return;
|
| 674 | } |
| 675 | if (!g_ascii_strncasecmp(parser->bufp, "<script", 7)) { |
| 676 | parser->bufp += 7;
|
| 677 | if ((p = html_find_str_case(parser, "</script")) != NULL) { |
| 678 | parser->bufp = p + 8;
|
| 679 | if ((p = html_find_char(parser, '>')) != NULL) |
| 680 | parser->bufp = p + 1;
|
| 681 | } |
| 682 | return;
|
| 683 | } |
| 684 | |
| 685 | parser->bufp++; |
| 686 | if ((p = html_find_char(parser, '>')) == NULL) |
| 687 | return;
|
| 688 | |
| 689 | strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
|
| 690 | g_strstrip(buf); |
| 691 | parser->bufp = p + 1;
|
| 692 | } |