root / src / html.c @ 1
History | View | Annotate | Download (17.6 kB)
| 1 | /*
|
|---|---|
| 2 | * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client |
| 3 | * Copyright (C) 1999-2003 Hiroyuki Yamamoto |
| 4 | * |
| 5 | * This program is free software; you can redistribute it and/or modify |
| 6 | * it under the terms of the GNU General Public License as published by |
| 7 | * the Free Software Foundation; either version 2 of the License, or |
| 8 | * (at your option) any later version. |
| 9 | * |
| 10 | * This program is distributed in the hope that it will be useful, |
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 13 | * GNU General Public License for more details. |
| 14 | * |
| 15 | * You should have received a copy of the GNU General Public License |
| 16 | * along with this program; if not, write to the Free Software |
| 17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| 18 | */ |
| 19 | |
| 20 | #include <glib.h> |
| 21 | #include <stdio.h> |
| 22 | #include <string.h> |
| 23 | #include <ctype.h> |
| 24 | |
| 25 | #include "html.h" |
| 26 | #include "codeconv.h" |
| 27 | #include "utils.h" |
| 28 | |
/* Size in bytes of the fixed line and tag buffers used in this file. */
#define HTMLBUFSIZE	8192

/* Text written to the output in place of an <hr> element. */
#define HR_STR		"------------------------------------------------"
| 31 | |
| 32 | typedef struct _HTMLSymbol HTMLSymbol; |
| 33 | |
| 34 | struct _HTMLSymbol
|
| 35 | {
|
| 36 | gchar *const key;
|
| 37 | gchar *const val;
|
| 38 | }; |
| 39 | |
| 40 | static HTMLSymbol symbol_list[] = {
|
| 41 | {"<" , "<"},
|
| 42 | {">" , ">"},
|
| 43 | {"&" , "&"},
|
| 44 | {""" , "\""},
|
| 45 | {" " , " "},
|
| 46 | {"™" , "(TM)"},
|
| 47 | |
| 48 | {"™", "(TM)"},
|
| 49 | }; |
| 50 | |
| 51 | static HTMLSymbol ascii_symbol_list[] = {
|
| 52 | {"¡" , "^!"},
|
| 53 | {"¦", "|"},
|
| 54 | {"©" , "(C)"},
|
| 55 | {"«" , "<<"},
|
| 56 | {"®" , "(R)"},
|
| 57 | |
| 58 | {"²" , "^2"},
|
| 59 | {"³" , "^3"},
|
| 60 | {"´" , "'"},
|
| 61 | {"¸" , ","},
|
| 62 | {"¹" , "^1"},
|
| 63 | {"»" , ">>"},
|
| 64 | {"¼", "1/4"},
|
| 65 | {"½", "1/2"},
|
| 66 | {"¾", "3/4"},
|
| 67 | {"¿", "^?"},
|
| 68 | |
| 69 | {"À", "A`"},
|
| 70 | {"Á", "A'"},
|
| 71 | {"Â" , "A^"},
|
| 72 | {"Ã", "A~"},
|
| 73 | {"Æ" , "AE"},
|
| 74 | {"È", "E`"},
|
| 75 | {"É", "E'"},
|
| 76 | {"Ê" , "E^"},
|
| 77 | {"Ì", "I`"},
|
| 78 | {"Í", "I'"},
|
| 79 | {"Î" , "I^"},
|
| 80 | |
| 81 | {"Ñ", "N~"},
|
| 82 | {"Ò", "O`"},
|
| 83 | {"Ó", "O'"},
|
| 84 | {"Ô" , "O^"},
|
| 85 | {"Õ", "O~"},
|
| 86 | {"Ù", "U`"},
|
| 87 | {"Ú", "U'"},
|
| 88 | {"Û" , "U^"},
|
| 89 | {"Ý", "Y'"},
|
| 90 | |
| 91 | {"à", "a`"},
|
| 92 | {"á", "a'"},
|
| 93 | {"â" , "a^"},
|
| 94 | {"ã", "a~"},
|
| 95 | {"æ" , "ae"},
|
| 96 | {"è", "e`"},
|
| 97 | {"é", "e'"},
|
| 98 | {"ê" , "e^"},
|
| 99 | {"ì", "i`"},
|
| 100 | {"í", "i'"},
|
| 101 | {"î" , "i^"},
|
| 102 | |
| 103 | {"ñ", "n~"},
|
| 104 | {"ò", "o`"},
|
| 105 | {"ó", "o'"},
|
| 106 | {"ô" , "o^"},
|
| 107 | {"õ", "o~"},
|
| 108 | {"ù", "u`"},
|
| 109 | {"ú", "u'"},
|
| 110 | {"û" , "u^"},
|
| 111 | {"ý", "y'"},
|
| 112 | }; |
| 113 | |
| 114 | static HTMLSymbol eucjp_symbol_list[] = {
|
| 115 | {"¡" , "^!"},
|
| 116 | {"¢" , "\xa1\xf1"},
|
| 117 | {"£" , "\xa1\xf2"},
|
| 118 | {"¥" , "\xa1\xef"},
|
| 119 | {"¦", "|"},
|
| 120 | {"§" , "\xa1\xf8"},
|
| 121 | {"¨" , "\xa1\xaf"},
|
| 122 | {"©" , "(C)"},
|
| 123 | {"«" , "<<"},
|
| 124 | {"®" , "(R)"},
|
| 125 | |
| 126 | {"°" , "\xa1\xeb"},
|
| 127 | {"±", "\xa1\xde"},
|
| 128 | {"²" , "^2"},
|
| 129 | {"³" , "^3"},
|
| 130 | {"´" , "'"},
|
| 131 | {"µ" , "\xa6\xcc"},
|
| 132 | {"¶" , "\xa2\xf9"},
|
| 133 | {"·", "\xa1\xa6"},
|
| 134 | {"¸" , ","},
|
| 135 | {"¹" , "^1"},
|
| 136 | {"»" , ">>"},
|
| 137 | {"¼", "1/4"},
|
| 138 | {"½", "1/2"},
|
| 139 | {"¾", "3/4"},
|
| 140 | {"¿", "^?"},
|
| 141 | |
| 142 | {"À", "A`"},
|
| 143 | {"Á", "A'"},
|
| 144 | {"Â" , "A^"},
|
| 145 | {"Ã", "A~"},
|
| 146 | {"Ä" , "A\xa1\xaf"},
|
| 147 | {"Å" , "A\xa1\xeb"},
|
| 148 | {"Æ" , "AE"},
|
| 149 | {"È", "E`"},
|
| 150 | {"É", "E'"},
|
| 151 | {"Ê" , "E^"},
|
| 152 | {"Ë" , "E\xa1\xaf"},
|
| 153 | {"Ì", "I`"},
|
| 154 | {"Í", "I'"},
|
| 155 | {"Î" , "I^"},
|
| 156 | {"Ï" , "I\xa1\xaf"},
|
| 157 | |
| 158 | {"Ñ", "N~"},
|
| 159 | {"Ò", "O`"},
|
| 160 | {"Ó", "O'"},
|
| 161 | {"Ô" , "O^"},
|
| 162 | {"Õ", "O~"},
|
| 163 | {"Ö" , "O\xa1\xaf"},
|
| 164 | {"×" , "\xa1\xdf"},
|
| 165 | {"Ù", "U`"},
|
| 166 | {"Ú", "U'"},
|
| 167 | {"Û" , "U^"},
|
| 168 | {"Ü" , "U\xa1\xaf"},
|
| 169 | {"Ý", "Y'"},
|
| 170 | |
| 171 | {"à", "a`"},
|
| 172 | {"á", "a'"},
|
| 173 | {"â" , "a^"},
|
| 174 | {"ã", "a~"},
|
| 175 | {"ä" , "a\xa1\xaf"},
|
| 176 | {"å" , "a\xa1\xeb"},
|
| 177 | {"æ" , "ae"},
|
| 178 | {"è", "e`"},
|
| 179 | {"é", "e'"},
|
| 180 | {"ê" , "e^"},
|
| 181 | {"ë" , "e\xa1\xaf"},
|
| 182 | {"ì", "i`"},
|
| 183 | {"í", "i'"},
|
| 184 | {"î" , "i^"},
|
| 185 | {"ï" , "i\xa1\xaf"},
|
| 186 | |
| 187 | {"ð" , "\xa2\xdf"},
|
| 188 | {"ñ", "n~"},
|
| 189 | {"ò", "o`"},
|
| 190 | {"ó", "o'"},
|
| 191 | {"ô" , "o^"},
|
| 192 | {"õ", "o~"},
|
| 193 | {"ö" , "o\xa1\xaf"},
|
| 194 | {"÷", "\xa1\xe0"},
|
| 195 | {"ù", "u`"},
|
| 196 | {"ú", "u'"},
|
| 197 | {"û" , "u^"},
|
| 198 | {"ü" , "u\xa1\xaf"},
|
| 199 | {"ý", "y'"},
|
| 200 | {"ÿ" , "y\xa1\xaf"},
|
| 201 | }; |
| 202 | |
| 203 | static HTMLSymbol latin_symbol_list[] = {
|
| 204 | {"¡" , "\xa1"},
|
| 205 | {"¢" , "\xa2"},
|
| 206 | {"£" , "\xa3"},
|
| 207 | {"¤", "\xa4"},
|
| 208 | {"¥" , "\xa5"},
|
| 209 | {"¦", "\xa6"},
|
| 210 | {"§" , "\xa7"},
|
| 211 | {"¨" , "\xa8"},
|
| 212 | {"©" , "\xa9"},
|
| 213 | {"ª" , "\xaa"},
|
| 214 | {"«" , "\xab"},
|
| 215 | {"¬" , "\xac"},
|
| 216 | {"­" , "\xad"},
|
| 217 | {"®" , "\xae"},
|
| 218 | {"¯" , "\xaf"},
|
| 219 | |
| 220 | {"°" , "\xb0"},
|
| 221 | {"±", "\xb1"},
|
| 222 | {"²" , "\xb2"},
|
| 223 | {"³" , "\xb3"},
|
| 224 | {"´" , "\xb4"},
|
| 225 | {"µ" , "\xb5"},
|
| 226 | {"¶" , "\xb6"},
|
| 227 | {"·", "\xb7"},
|
| 228 | {"¸" , "\xb8"},
|
| 229 | {"¹" , "\xb9"},
|
| 230 | {"º" , "\xba"},
|
| 231 | {"»" , "\xbb"},
|
| 232 | {"¼", "\xbc"},
|
| 233 | {"½", "\xbd"},
|
| 234 | {"¾", "\xbe"},
|
| 235 | {"¿", "\xbf"},
|
| 236 | |
| 237 | {"À", "\xc0"},
|
| 238 | {"Á", "\xc1"},
|
| 239 | {"Â" , "\xc2"},
|
| 240 | {"Ã", "\xc3"},
|
| 241 | {"Ä" , "\xc4"},
|
| 242 | {"Å" , "\xc5"},
|
| 243 | {"Æ" , "\xc6"},
|
| 244 | {"Ç", "\xc7"},
|
| 245 | {"È", "\xc8"},
|
| 246 | {"É", "\xc9"},
|
| 247 | {"Ê" , "\xca"},
|
| 248 | {"Ë" , "\xcb"},
|
| 249 | {"Ì", "\xcc"},
|
| 250 | {"Í", "\xcd"},
|
| 251 | {"Î" , "\xce"},
|
| 252 | {"Ï" , "\xcf"},
|
| 253 | |
| 254 | {"Ð" , "\xd0"},
|
| 255 | {"Ñ", "\xd1"},
|
| 256 | {"Ò", "\xd2"},
|
| 257 | {"Ó", "\xd3"},
|
| 258 | {"Ô" , "\xd4"},
|
| 259 | {"Õ", "\xd5"},
|
| 260 | {"Ö" , "\xd6"},
|
| 261 | {"×" , "\xd7"},
|
| 262 | {"Ø", "\xd8"},
|
| 263 | {"Ù", "\xd9"},
|
| 264 | {"Ú", "\xda"},
|
| 265 | {"Û" , "\xdb"},
|
| 266 | {"Ü" , "\xdc"},
|
| 267 | {"Ý", "\xdd"},
|
| 268 | {"Þ" , "\xde"},
|
| 269 | {"ß" , "\xdf"},
|
| 270 | |
| 271 | {"à", "\xe0"},
|
| 272 | {"á", "\xe1"},
|
| 273 | {"â" , "\xe2"},
|
| 274 | {"ã", "\xe3"},
|
| 275 | {"ä" , "\xe4"},
|
| 276 | {"å" , "\xe5"},
|
| 277 | {"æ" , "\xe6"},
|
| 278 | {"ç", "\xe7"},
|
| 279 | {"è", "\xe8"},
|
| 280 | {"é", "\xe9"},
|
| 281 | {"ê" , "\xea"},
|
| 282 | {"ë" , "\xeb"},
|
| 283 | {"ì", "\xec"},
|
| 284 | {"í", "\xed"},
|
| 285 | {"î" , "\xee"},
|
| 286 | {"ï" , "\xef"},
|
| 287 | |
| 288 | {"ð" , "\xf0"},
|
| 289 | {"ñ", "\xf1"},
|
| 290 | {"ò", "\xf2"},
|
| 291 | {"ó", "\xf3"},
|
| 292 | {"ô" , "\xf4"},
|
| 293 | {"õ", "\xf5"},
|
| 294 | {"ö" , "\xf6"},
|
| 295 | {"÷", "\xf7"},
|
| 296 | {"ø", "\xf8"},
|
| 297 | {"ù", "\xf9"},
|
| 298 | {"ú", "\xfa"},
|
| 299 | {"û" , "\xfb"},
|
| 300 | {"ü" , "\xfc"},
|
| 301 | {"ý", "\xfd"},
|
| 302 | {"þ" , "\xfe"},
|
| 303 | {"ÿ" , "\xff"},
|
| 304 | }; |
| 305 | |
| 306 | static GHashTable *default_symbol_table;
|
| 307 | static GHashTable *eucjp_symbol_table;
|
| 308 | static GHashTable *latin_symbol_table;
|
| 309 | |
| 310 | static HTMLState html_read_line (HTMLParser *parser);
|
| 311 | static void html_append_char (HTMLParser *parser, |
| 312 | gchar ch); |
| 313 | static void html_append_str (HTMLParser *parser, |
| 314 | const gchar *str,
|
| 315 | gint len); |
| 316 | static HTMLState html_parse_tag (HTMLParser *parser);
|
| 317 | static void html_parse_special (HTMLParser *parser); |
| 318 | static void html_get_parenthesis (HTMLParser *parser, |
| 319 | gchar *buf, |
| 320 | gint len); |
| 321 | |
| 322 | |
| 323 | HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv) |
| 324 | {
|
| 325 | HTMLParser *parser; |
| 326 | |
| 327 | g_return_val_if_fail(fp != NULL, NULL); |
| 328 | g_return_val_if_fail(conv != NULL, NULL); |
| 329 | |
| 330 | parser = g_new0(HTMLParser, 1);
|
| 331 | parser->fp = fp; |
| 332 | parser->conv = conv; |
| 333 | parser->str = g_string_new(NULL);
|
| 334 | parser->buf = g_string_new(NULL);
|
| 335 | parser->bufp = parser->buf->str; |
| 336 | parser->state = HTML_NORMAL; |
| 337 | parser->href = NULL;
|
| 338 | parser->newline = TRUE; |
| 339 | parser->empty_line = TRUE; |
| 340 | parser->space = FALSE; |
| 341 | parser->pre = FALSE; |
| 342 | |
| 343 | #define SYMBOL_TABLE_ADD(table, list) \
|
| 344 | { \
|
| 345 | gint i; \ |
| 346 | \ |
| 347 | for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \ |
| 348 | g_hash_table_insert(table, list[i].key, list[i].val); \ |
| 349 | } |
| 350 | |
| 351 | if (!default_symbol_table) {
|
| 352 | default_symbol_table = |
| 353 | g_hash_table_new(g_str_hash, g_str_equal); |
| 354 | SYMBOL_TABLE_ADD(default_symbol_table, symbol_list); |
| 355 | SYMBOL_TABLE_ADD(default_symbol_table, ascii_symbol_list); |
| 356 | } |
| 357 | if (!eucjp_symbol_table) {
|
| 358 | eucjp_symbol_table = |
| 359 | g_hash_table_new(g_str_hash, g_str_equal); |
| 360 | SYMBOL_TABLE_ADD(eucjp_symbol_table, symbol_list); |
| 361 | SYMBOL_TABLE_ADD(eucjp_symbol_table, eucjp_symbol_list); |
| 362 | } |
| 363 | if (!latin_symbol_table) {
|
| 364 | latin_symbol_table = |
| 365 | g_hash_table_new(g_str_hash, g_str_equal); |
| 366 | SYMBOL_TABLE_ADD(latin_symbol_table, symbol_list); |
| 367 | SYMBOL_TABLE_ADD(latin_symbol_table, latin_symbol_list); |
| 368 | } |
| 369 | |
| 370 | #undef SYMBOL_TABLE_ADD
|
| 371 | |
| 372 | if (conv->charset == C_ISO_8859_1)
|
| 373 | parser->symbol_table = latin_symbol_table; |
| 374 | else if ((conv->charset == C_ISO_2022_JP || |
| 375 | conv->charset == C_ISO_2022_JP_2 || |
| 376 | conv->charset == C_EUC_JP || |
| 377 | conv->charset == C_SHIFT_JIS) && |
| 378 | conv_get_locale_charset() == C_EUC_JP) |
| 379 | parser->symbol_table = eucjp_symbol_table; |
| 380 | else
|
| 381 | parser->symbol_table = default_symbol_table; |
| 382 | |
| 383 | return parser;
|
| 384 | } |
| 385 | |
| 386 | void html_parser_destroy(HTMLParser *parser)
|
| 387 | {
|
| 388 | g_string_free(parser->str, TRUE); |
| 389 | g_string_free(parser->buf, TRUE); |
| 390 | g_free(parser->href); |
| 391 | g_free(parser); |
| 392 | } |
| 393 | |
| 394 | gchar *html_parse(HTMLParser *parser) |
| 395 | {
|
| 396 | parser->state = HTML_NORMAL; |
| 397 | g_string_truncate(parser->str, 0);
|
| 398 | |
| 399 | if (*parser->bufp == '\0') { |
| 400 | g_string_truncate(parser->buf, 0);
|
| 401 | parser->bufp = parser->buf->str; |
| 402 | if (html_read_line(parser) == HTML_EOF)
|
| 403 | return NULL; |
| 404 | } |
| 405 | |
| 406 | while (*parser->bufp != '\0') { |
| 407 | switch (*parser->bufp) {
|
| 408 | case '<': |
| 409 | if (parser->str->len == 0) |
| 410 | html_parse_tag(parser); |
| 411 | else
|
| 412 | return parser->str->str;
|
| 413 | break;
|
| 414 | case '&': |
| 415 | html_parse_special(parser); |
| 416 | break;
|
| 417 | case ' ': |
| 418 | case '\t': |
| 419 | case '\r': |
| 420 | case '\n': |
| 421 | if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n') |
| 422 | parser->bufp++; |
| 423 | |
| 424 | if (!parser->pre) {
|
| 425 | if (!parser->newline)
|
| 426 | parser->space = TRUE; |
| 427 | |
| 428 | parser->bufp++; |
| 429 | break;
|
| 430 | } |
| 431 | /* fallthrough */
|
| 432 | default:
|
| 433 | html_append_char(parser, *parser->bufp++); |
| 434 | } |
| 435 | } |
| 436 | |
| 437 | return parser->str->str;
|
| 438 | } |
| 439 | |
| 440 | static HTMLState html_read_line(HTMLParser *parser)
|
| 441 | {
|
| 442 | gchar buf[HTMLBUFSIZE]; |
| 443 | gchar buf2[HTMLBUFSIZE]; |
| 444 | gint index; |
| 445 | |
| 446 | if (fgets(buf, sizeof(buf), parser->fp) == NULL) { |
| 447 | parser->state = HTML_EOF; |
| 448 | return HTML_EOF;
|
| 449 | } |
| 450 | |
| 451 | if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) { |
| 452 | index = parser->bufp - parser->buf->str; |
| 453 | |
| 454 | conv_localetodisp(buf2, sizeof(buf2), buf);
|
| 455 | g_string_append(parser->buf, buf2); |
| 456 | |
| 457 | parser->bufp = parser->buf->str + index; |
| 458 | |
| 459 | return HTML_CONV_FAILED;
|
| 460 | } |
| 461 | |
| 462 | index = parser->bufp - parser->buf->str; |
| 463 | |
| 464 | g_string_append(parser->buf, buf2); |
| 465 | |
| 466 | parser->bufp = parser->buf->str + index; |
| 467 | |
| 468 | return HTML_NORMAL;
|
| 469 | } |
| 470 | |
| 471 | static void html_append_char(HTMLParser *parser, gchar ch) |
| 472 | {
|
| 473 | GString *str = parser->str; |
| 474 | |
| 475 | if (!parser->pre && parser->space) {
|
| 476 | g_string_append_c(str, ' ');
|
| 477 | parser->space = FALSE; |
| 478 | } |
| 479 | |
| 480 | g_string_append_c(str, ch); |
| 481 | |
| 482 | parser->empty_line = FALSE; |
| 483 | if (ch == '\n') { |
| 484 | parser->newline = TRUE; |
| 485 | if (str->len > 1 && str->str[str->len - 2] == '\n') |
| 486 | parser->empty_line = TRUE; |
| 487 | } else
|
| 488 | parser->newline = FALSE; |
| 489 | } |
| 490 | |
| 491 | static void html_append_str(HTMLParser *parser, const gchar *str, gint len) |
| 492 | {
|
| 493 | GString *string = parser->str; |
| 494 | |
| 495 | if (!parser->pre && parser->space) {
|
| 496 | g_string_append_c(string, ' ');
|
| 497 | parser->space = FALSE; |
| 498 | } |
| 499 | |
| 500 | if (len == 0) return; |
| 501 | if (len < 0) |
| 502 | g_string_append(string, str); |
| 503 | else {
|
| 504 | gchar *s; |
| 505 | Xstrndup_a(s, str, len, return);
|
| 506 | g_string_append(string, s); |
| 507 | } |
| 508 | |
| 509 | parser->empty_line = FALSE; |
| 510 | if (string->len > 0 && string->str[string->len - 1] == '\n') { |
| 511 | parser->newline = TRUE; |
| 512 | if (string->len > 1 && string->str[string->len - 2] == '\n') |
| 513 | parser->empty_line = TRUE; |
| 514 | } else
|
| 515 | parser->newline = FALSE; |
| 516 | } |
| 517 | |
| 518 | static HTMLTag *html_get_tag(const gchar *str) |
| 519 | {
|
| 520 | HTMLTag *tag; |
| 521 | gchar *tmp; |
| 522 | guchar *tmpp; |
| 523 | |
| 524 | g_return_val_if_fail(str != NULL, NULL); |
| 525 | |
| 526 | if (*str == '\0' || *str == '!') return NULL; |
| 527 | |
| 528 | Xstrdup_a(tmp, str, return NULL); |
| 529 | |
| 530 | tag = g_new0(HTMLTag, 1);
|
| 531 | |
| 532 | for (tmpp = tmp; *tmpp != '\0' && !isspace(*tmpp); tmpp++) |
| 533 | ; |
| 534 | |
| 535 | if (*tmpp == '\0') { |
| 536 | g_strdown(tmp); |
| 537 | tag->name = g_strdup(tmp); |
| 538 | return tag;
|
| 539 | } else {
|
| 540 | *tmpp++ = '\0';
|
| 541 | g_strdown(tmp); |
| 542 | tag->name = g_strdup(tmp); |
| 543 | } |
| 544 | |
| 545 | while (*tmpp != '\0') { |
| 546 | HTMLAttr *attr; |
| 547 | gchar *attr_name; |
| 548 | gchar *attr_value; |
| 549 | gchar *p; |
| 550 | gchar quote; |
| 551 | |
| 552 | while (isspace(*tmpp)) tmpp++;
|
| 553 | attr_name = tmpp; |
| 554 | |
| 555 | while (*tmpp != '\0' && !isspace(*tmpp) && *tmpp != '=') tmpp++; |
| 556 | if (*tmpp != '\0' && *tmpp != '=') { |
| 557 | *tmpp++ = '\0';
|
| 558 | while (isspace(*tmpp)) tmpp++;
|
| 559 | } |
| 560 | |
| 561 | if (*tmpp == '=') { |
| 562 | *tmpp++ = '\0';
|
| 563 | while (isspace(*tmpp)) tmpp++;
|
| 564 | |
| 565 | if (*tmpp == '"' || *tmpp == '\'') { |
| 566 | /* name="value" */
|
| 567 | quote = *tmpp; |
| 568 | tmpp++; |
| 569 | attr_value = tmpp; |
| 570 | if ((p = strchr(attr_value, quote)) == NULL) { |
| 571 | g_warning("html_get_tag(): syntax error in tag: '%s'\n", str);
|
| 572 | return tag;
|
| 573 | } |
| 574 | tmpp = p; |
| 575 | *tmpp++ = '\0';
|
| 576 | while (isspace(*tmpp)) tmpp++;
|
| 577 | } else {
|
| 578 | /* name=value */
|
| 579 | attr_value = tmpp; |
| 580 | while (*tmpp != '\0' && !isspace(*tmpp)) tmpp++; |
| 581 | if (*tmpp != '\0') |
| 582 | *tmpp++ = '\0';
|
| 583 | } |
| 584 | } else
|
| 585 | attr_value = "";
|
| 586 | |
| 587 | g_strchomp(attr_name); |
| 588 | g_strdown(attr_name); |
| 589 | attr = g_new(HTMLAttr, 1);
|
| 590 | attr->name = g_strdup(attr_name); |
| 591 | attr->value = g_strdup(attr_value); |
| 592 | tag->attr = g_list_append(tag->attr, attr); |
| 593 | } |
| 594 | |
| 595 | return tag;
|
| 596 | } |
| 597 | |
| 598 | static void html_free_tag(HTMLTag *tag) |
| 599 | {
|
| 600 | if (!tag) return; |
| 601 | |
| 602 | g_free(tag->name); |
| 603 | while (tag->attr != NULL) { |
| 604 | HTMLAttr *attr = (HTMLAttr *)tag->attr->data; |
| 605 | g_free(attr->name); |
| 606 | g_free(attr->value); |
| 607 | g_free(attr); |
| 608 | tag->attr = g_list_remove(tag->attr, tag->attr->data); |
| 609 | } |
| 610 | g_free(tag); |
| 611 | } |
| 612 | |
| 613 | static HTMLState html_parse_tag(HTMLParser *parser)
|
| 614 | {
|
| 615 | gchar buf[HTMLBUFSIZE]; |
| 616 | HTMLTag *tag; |
| 617 | |
| 618 | html_get_parenthesis(parser, buf, sizeof(buf));
|
| 619 | |
| 620 | tag = html_get_tag(buf); |
| 621 | |
| 622 | parser->state = HTML_UNKNOWN; |
| 623 | if (!tag) return HTML_UNKNOWN; |
| 624 | |
| 625 | if (!strcmp(tag->name, "br")) { |
| 626 | parser->space = FALSE; |
| 627 | html_append_char(parser, '\n');
|
| 628 | parser->state = HTML_BR; |
| 629 | } else if (!strcmp(tag->name, "a")) { |
| 630 | if (tag->attr && tag->attr->data &&
|
| 631 | !strcmp(((HTMLAttr *)tag->attr->data)->name, "href")) {
|
| 632 | g_free(parser->href); |
| 633 | parser->href = |
| 634 | g_strdup(((HTMLAttr *)tag->attr->data)->value); |
| 635 | parser->state = HTML_HREF; |
| 636 | } |
| 637 | } else if (!strcmp(tag->name, "/a")) { |
| 638 | g_free(parser->href); |
| 639 | parser->href = NULL;
|
| 640 | parser->state = HTML_NORMAL; |
| 641 | } else if (!strcmp(tag->name, "p")) { |
| 642 | parser->space = FALSE; |
| 643 | if (!parser->empty_line) {
|
| 644 | parser->space = FALSE; |
| 645 | if (!parser->newline) html_append_char(parser, '\n'); |
| 646 | html_append_char(parser, '\n');
|
| 647 | } |
| 648 | parser->state = HTML_PAR; |
| 649 | } else if (!strcmp(tag->name, "pre")) { |
| 650 | parser->pre = TRUE; |
| 651 | parser->state = HTML_PRE; |
| 652 | } else if (!strcmp(tag->name, "/pre")) { |
| 653 | parser->pre = FALSE; |
| 654 | parser->state = HTML_NORMAL; |
| 655 | } else if (!strcmp(tag->name, "hr")) { |
| 656 | if (!parser->newline) {
|
| 657 | parser->space = FALSE; |
| 658 | html_append_char(parser, '\n');
|
| 659 | } |
| 660 | html_append_str(parser, HR_STR "\n", -1); |
| 661 | parser->state = HTML_HR; |
| 662 | } else if (!strcmp(tag->name, "div") || |
| 663 | !strcmp(tag->name, "ul") ||
|
| 664 | !strcmp(tag->name, "li") ||
|
| 665 | !strcmp(tag->name, "table") ||
|
| 666 | !strcmp(tag->name, "tr") ||
|
| 667 | (tag->name[0] == 'h' && isdigit((guchar)tag->name[1]))) { |
| 668 | if (!parser->newline) {
|
| 669 | parser->space = FALSE; |
| 670 | html_append_char(parser, '\n');
|
| 671 | } |
| 672 | parser->state = HTML_NORMAL; |
| 673 | } else if (!strcmp(tag->name, "/table") || |
| 674 | (tag->name[0] == '/' && |
| 675 | tag->name[1] == 'h' && |
| 676 | isdigit((guchar)tag->name[1]))) {
|
| 677 | if (!parser->empty_line) {
|
| 678 | parser->space = FALSE; |
| 679 | if (!parser->newline) html_append_char(parser, '\n'); |
| 680 | html_append_char(parser, '\n');
|
| 681 | } |
| 682 | parser->state = HTML_NORMAL; |
| 683 | } else if (!strcmp(tag->name, "/div") || |
| 684 | !strcmp(tag->name, "/ul") ||
|
| 685 | !strcmp(tag->name, "/li")) {
|
| 686 | if (!parser->newline) {
|
| 687 | parser->space = FALSE; |
| 688 | html_append_char(parser, '\n');
|
| 689 | } |
| 690 | parser->state = HTML_NORMAL; |
| 691 | } |
| 692 | |
| 693 | html_free_tag(tag); |
| 694 | |
| 695 | return parser->state;
|
| 696 | } |
| 697 | |
| 698 | static void html_parse_special(HTMLParser *parser) |
| 699 | {
|
| 700 | gchar symbol_name[9];
|
| 701 | gint n; |
| 702 | const gchar *val;
|
| 703 | |
| 704 | parser->state = HTML_UNKNOWN; |
| 705 | g_return_if_fail(*parser->bufp == '&');
|
| 706 | |
| 707 | /* &foo; */
|
| 708 | for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++) |
| 709 | ; |
| 710 | if (n > 7 || parser->bufp[n] != ';') { |
| 711 | /* output literal `&' */
|
| 712 | html_append_char(parser, *parser->bufp++); |
| 713 | parser->state = HTML_NORMAL; |
| 714 | return;
|
| 715 | } |
| 716 | strncpy2(symbol_name, parser->bufp, n + 2);
|
| 717 | parser->bufp += n + 1;
|
| 718 | |
| 719 | if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
|
| 720 | != NULL) {
|
| 721 | html_append_str(parser, val, -1);
|
| 722 | parser->state = HTML_NORMAL; |
| 723 | return;
|
| 724 | } else if (symbol_name[1] == '#' && isdigit((guchar)symbol_name[2])) { |
| 725 | gint ch; |
| 726 | |
| 727 | ch = atoi(symbol_name + 2);
|
| 728 | if ((ch > 0 && ch <= 127) || |
| 729 | (ch >= 128 && ch <= 255 && |
| 730 | parser->conv->charset == C_ISO_8859_1)) {
|
| 731 | html_append_char(parser, ch); |
| 732 | parser->state = HTML_NORMAL; |
| 733 | return;
|
| 734 | } |
| 735 | } |
| 736 | |
| 737 | html_append_str(parser, symbol_name, -1);
|
| 738 | } |
| 739 | |
| 740 | static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len) |
| 741 | {
|
| 742 | gchar *p; |
| 743 | |
| 744 | buf[0] = '\0'; |
| 745 | g_return_if_fail(*parser->bufp == '<');
|
| 746 | |
| 747 | /* ignore comment / CSS / script stuff */
|
| 748 | if (!strncmp(parser->bufp, "<!--", 4)) { |
| 749 | parser->bufp += 4;
|
| 750 | while ((p = strstr(parser->bufp, "-->")) == NULL) |
| 751 | if (html_read_line(parser) == HTML_EOF) return; |
| 752 | parser->bufp = p + 3;
|
| 753 | return;
|
| 754 | } |
| 755 | if (!g_strncasecmp(parser->bufp, "<style", 6)) { |
| 756 | parser->bufp += 6;
|
| 757 | while ((p = strcasestr(parser->bufp, "</style>")) == NULL) |
| 758 | if (html_read_line(parser) == HTML_EOF) return; |
| 759 | parser->bufp = p + 8;
|
| 760 | return;
|
| 761 | } |
| 762 | if (!g_strncasecmp(parser->bufp, "<script", 7)) { |
| 763 | parser->bufp += 7;
|
| 764 | while ((p = strcasestr(parser->bufp, "</script>")) == NULL) |
| 765 | if (html_read_line(parser) == HTML_EOF) return; |
| 766 | parser->bufp = p + 9;
|
| 767 | return;
|
| 768 | } |
| 769 | |
| 770 | parser->bufp++; |
| 771 | while ((p = strchr(parser->bufp, '>')) == NULL) |
| 772 | if (html_read_line(parser) == HTML_EOF) return; |
| 773 | |
| 774 | strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
|
| 775 | g_strstrip(buf); |
| 776 | parser->bufp = p + 1;
|
| 777 | } |