Statistics
| Branch: | Tag: | Revision:

root / lib / filters / wordsep-filter.c @ 2d9cf61c

History | View | Annotate | Download (8.9 kB)

1
/* SylFilter - a message filter
 *
 * Copyright (C) 2011 Hiroyuki Yamamoto
 * Copyright (C) 2011 Sylpheed Development Team
 */
6
7
#include <glib.h>
8
#include <string.h>
9
10
#include "filter.h"
11
#include "wordsep-filter.h"
12
13
/* Script class assigned to a wide character; do_wordsep() compares the
 * class of neighbouring characters to decide where a word ends. */
typedef enum
{
        UC_OTHER = 0,        /* not matched by any of the u_is_*() tests */
        UC_HIRA  = 1,        /* matched by u_is_hira() */
        UC_KATA  = 2,        /* matched by u_is_kata() */
        UC_HAN   = 3         /* matched by u_is_han() */
} UCType;
20
21
/* A word with more than this many characters is dropped from the output. */
#define MAX_TOKEN_LEN        42
/* Number of characters per n-gram (4-gram) emitted for long han runs. */
#define NGRAM_LEN        4
23
24
/* Non-zero when c lies in the inclusive range [start, end].
 * All three arguments are parenthesized so that compound expressions
 * (e.g. a ternary) expand correctly.  c is evaluated twice, so it must
 * not have side effects. */
#define URANGE(c, start, end) ((c) >= (start) && (c) <= (end))
25
26
/* Hiragana test: the letters U+3041..U+3096 and the iteration marks /
 * digraph at U+309D..U+309F. */
#define u_is_hira(c) \
        (URANGE(c, 0x3041, 0x3096) || \
         URANGE(c, 0x309d, 0x309f))
28
/* Katakana test.  The ranges skip U+30FB/U+30FC and U+FF70, so the middle
 * dot and the prolonged sound marks are not classified as katakana here. */
#define u_is_kata(c) \
        (URANGE(c, 0x30a1, 0x30fa) ||        /* katakana letters */ \
         URANGE(c, 0x30fd, 0x30ff) ||        /* iteration marks, digraph */ \
         URANGE(c, 0x31f0, 0x31ff) ||        /* phonetic extensions */ \
         URANGE(c, 0xff66, 0xff6f) ||        /* halfwidth wo and small letters */ \
         URANGE(c, 0xff71, 0xff9d))          /* halfwidth letters */
33
/* Han (CJK ideograph) test, including radicals, a few ideographic marks
 * and numerals, and the whole U+20000..U+2FFFF plane. */
#define u_is_han(c) \
        (URANGE(c, 0x2e80, 0x2fdf) ||        /* CJK / Kangxi radicals */ \
         (c) == 0x3005 ||                    /* ideographic iteration mark */ \
         (c) == 0x3007 ||                    /* ideographic number zero */ \
         URANGE(c, 0x3021, 0x3029) ||        /* Hangzhou numerals 1-9 */ \
         URANGE(c, 0x3038, 0x303b) ||        /* Hangzhou 10/20/30, vertical iteration mark */ \
         URANGE(c, 0x3400, 0x4dbf) ||        /* CJK unified ideographs extension A */ \
         URANGE(c, 0x4e00, 0x9fff) ||        /* CJK unified ideographs */ \
         URANGE(c, 0xf900, 0xfaff) ||        /* CJK compatibility ideographs */ \
         URANGE(c, 0x20000, 0x2ffff))        /* supplementary ideographic plane */
42
43
/* Append word w to the GString s as a new token.
 *
 * A single space is inserted first when s is not empty, and the token is
 * prefixed with "Url*" when is_url is true.
 *
 * NOTE: is_url is NOT a macro parameter; it is read from the scope in which
 * the macro is expanded, so a variable of that name must be visible there.
 *
 * The body is wrapped in do { } while (0) so the expansion is one statement
 * and stays correct inside unbraced if/else branches. */
#define APPEND_STR(s, w)                                \
do {                                                    \
        if ((s)->len > 0)                               \
                g_string_append_c((s), ' ');            \
        if (is_url)                                     \
                g_string_append((s), "Url*");           \
        g_string_append((s), (w));                      \
} while (0)
51
52
53
/*
 * Append every NGRAM_LEN-character window of word to str.
 *
 * The window slides forward one UTF-8 character at a time.  Each window is
 * appended as its own token: separated from earlier output by one space and
 * prefixed with "Url*" when is_url is true.  Nothing is appended when word
 * holds fewer than NGRAM_LEN characters.
 *
 * str    - output buffer, appended to in place
 * word   - NUL-terminated string, stepped through with g_utf8_next_char()
 * is_url - whether to tag each n-gram with the "Url*" prefix
 */
static void append_ngram_str(GString *str, const char *word, gboolean is_url)
{
        const char *start = word;

        xfilter_debug_print("append_ngram_str: %s\n", word);

        while (*start != '\0') {
                const char *end = start;
                int n_chars = 0;

                /* stretch the window over up to NGRAM_LEN characters */
                while (n_chars < NGRAM_LEN && *end != '\0') {
                        end = g_utf8_next_char(end);
                        n_chars++;
                }
                /* fewer than NGRAM_LEN characters remain: no more windows */
                if (n_chars < NGRAM_LEN)
                        break;

                if (str->len > 0)
                        g_string_append_c(str, ' ');
                if (is_url)
                        g_string_append(str, "Url*");
                g_string_append_len(str, start, end - start);
                /* "%.*s" takes an int precision; the pointer difference is a
                 * ptrdiff_t and must be converted explicitly. */
                xfilter_debug_print("n-gram: %.*s\n", (int)(end - start), start);

                /* slide the window by one character */
                start = g_utf8_next_char(start);
        }
}
83
84
/*
 * Split content into words and return them joined by single spaces.
 *
 * The text is walked one UTF-8 character at a time.  A word ends when the
 * character class changes (wide vs. narrow, punctuation vs. not, han /
 * hiragana / katakana / other) or at whitespace or punctuation.  The
 * characters '!', '$', '\'' and '-' are never treated as punctuation, and
 * '.' / ',' between two digits are kept inside the number.
 *
 * content            - NUL-terminated text to split (must not be NULL)
 * drop_punct         - drop a word whose last character is punctuation
 * drop_single_letter - drop one-character words unless the character is han
 * drop_two_hira      - drop words made of exactly two hiragana
 * normalize          - lowercase ASCII letters of each emitted word
 * mb_ngram           - emit han words longer than NGRAM_LEN characters as
 *                      NGRAM_LEN-grams instead of as one word
 * is_url             - content is a URL body: words get the "Url*" prefix
 *                      and no further URL detection is done
 * is_header          - content is a header value: URL detection is skipped
 *
 * Words longer than MAX_TOKEN_LEN characters are always dropped.
 *
 * Returns a newly allocated string which the caller must g_free().
 */
static char *do_wordsep(const char *content, gboolean drop_punct,
                        gboolean drop_single_letter, gboolean drop_two_hira,
                        gboolean normalize, gboolean mb_ngram,
                        gboolean is_url, gboolean is_header)
{
        GString *out_str;
        const char *p = content;        /* current character */
        const char *bp = content;       /* start of the current word */
        UCType prev_uctype = UC_OTHER;
        UCType prev2_uctype = UC_OTHER;
        GUnicodeType prev_utype = G_UNICODE_OTHER_LETTER;
        gboolean prev_is_wide = FALSE;
        gboolean prev_is_punct = FALSE;
        gboolean prev_is_digit = FALSE;
        gint word_len = 0;              /* characters in the current word */

        out_str = g_string_new("");

        while (*p != '\0') {
                gunichar wc;
                UCType uctype = UC_OTHER;
                GUnicodeType utype = G_UNICODE_OTHER_LETTER;
                gboolean is_wide, is_space, is_punct, is_digit;
                gboolean do_break = FALSE;
                gboolean is_drop = FALSE;
                gboolean do_ngram = FALSE;

                /* A word starting with http:// or https:// is split
                 * separately (recursively, with is_url set) so that its
                 * words carry the "Url*" prefix.  The scheme and a leading
                 * "www." are discarded. */
                if (!is_header && !is_url && p == bp && *p == 'h') {
                        if (!strncmp(p, "http://", 7))
                                p += 7;
                        else if (!strncmp(p, "https://", 8))
                                p += 8;
                        if (p > bp) {
                                if (!strncmp(p, "www.", 4))
                                        p += 4;
                        }

                        if (p > bp) {
                                const char *ep = p;
                                char *url_str, *sep_str;

                                /* the URL runs up to the first non-graphic
                                 * ASCII byte */
                                while (*ep != '\0' && g_ascii_isgraph(*ep))
                                        ep++;
                                url_str = g_strndup(p, ep - p);
                                sep_str = do_wordsep(url_str, drop_punct, drop_single_letter, drop_two_hira, normalize, mb_ngram, TRUE, is_header);
                                /* skip the separator when nothing was
                                 * produced, to avoid a stray double space */
                                if (*sep_str != '\0') {
                                        if (out_str->len > 0)
                                                g_string_append_c(out_str, ' ');
                                        g_string_append(out_str, sep_str);
                                }
                                /* the recursive result is owned by us */
                                g_free(sep_str);
                                g_free(url_str);
                                word_len = 0;
                                bp = p = ep;
                                continue;
                        }
                }

                /* classify the current character */
                wc = g_utf8_get_char(p);
                utype = g_unichar_type(wc);
                is_wide = g_unichar_iswide(wc) || wc == 0x25cb ||
                        URANGE(wc, 0x25ce, 0x25d1);
                is_space = g_unichar_isspace(wc);
                is_punct = (*p != '!' && *p != '$' && *p != '\'' && *p != '-')
                        && g_unichar_ispunct(wc);
                is_digit = g_unichar_isdigit(wc);
                if (is_wide) {
                        if (u_is_hira(wc))
                                uctype = UC_HIRA;
                        else if (u_is_kata(wc))
                                uctype = UC_KATA;
                        else if (u_is_han(wc))
                                uctype = UC_HAN;
                }
#if 0
                if (utype == G_UNICODE_DASH_PUNCTUATION ||
                    utype == G_UNICODE_OTHER_SYMBOL)
                        is_punct = FALSE;
#endif
                /* handle 1,234,567 / 192.168.0.1 */
                if (*p == '.' || *p == ',') {
                        if (prev_is_digit) {
                                const char *np;
                                gunichar nwc;
                                np = g_utf8_next_char(p);
                                nwc = g_utf8_get_char(np);
                                if (g_unichar_isdigit(nwc))
                                        is_punct = FALSE;
                        }
                }

#if 0
                {
                        gchar s[7] = {0};
                        g_unichar_to_utf8(wc, s);
                        xfilter_debug_print("%s: utype=%d word_len=%d\n", s, utype, word_len);
                }
#endif

                /* decide whether the word [bp, p) ends before this character */
                if (p > bp) {
                        if (prev_is_wide != is_wide)
                                do_break = TRUE;
                        else if (prev_is_punct != is_punct)
                                do_break = TRUE;
                        else if (is_space || is_punct)
                                do_break = TRUE;
                        else if (prev_uctype != UC_HAN && uctype == UC_HAN)
                                do_break = TRUE;
                        else if (prev_uctype == UC_HAN && uctype != UC_HAN)
                                do_break = TRUE;
                        else if (prev_uctype == UC_KATA &&
                                 uctype != UC_KATA &&
                                 utype != G_UNICODE_MODIFIER_LETTER &&
                                 utype != G_UNICODE_OTHER_SYMBOL)
                                do_break = TRUE;
                        else if (prev_uctype == UC_HIRA &&
                                 uctype != UC_HIRA &&
                                 utype != G_UNICODE_MODIFIER_LETTER &&
                                 utype != G_UNICODE_OTHER_SYMBOL)
                                do_break = TRUE;
                        else if (prev_uctype == UC_OTHER &&
                                 uctype != UC_OTHER &&
                                 prev_utype != G_UNICODE_MODIFIER_LETTER)
                                do_break = TRUE;
                        else if (prev_utype == G_UNICODE_MODIFIER_LETTER &&
                                 prev2_uctype != uctype)
                                do_break = TRUE;
                }

                /* decide whether the finished word is discarded; these tests
                 * use the prev_* state of the word's last character(s) */
                if (drop_punct) {
                        is_drop = prev_is_punct;
                }
                if (drop_single_letter) {
                        if (word_len == 1 && do_break && prev_uctype != UC_HAN)
                                is_drop = TRUE;
                }
                if (drop_two_hira) {
                        if (word_len == 2 && do_break && prev2_uctype == UC_HIRA && prev_uctype == UC_HIRA)
                                is_drop = TRUE;
                }

                if (word_len > MAX_TOKEN_LEN && do_break)
                        is_drop = TRUE;

                if (mb_ngram && !is_drop && do_break &&
                    prev_uctype == UC_HAN && word_len > NGRAM_LEN)
                        do_ngram = TRUE;

                /* shift the character history */
                prev2_uctype = prev_uctype;
                prev_uctype = uctype;
                prev_utype = utype;
                prev_is_wide = is_wide;
                prev_is_punct = is_punct;
                prev_is_digit = is_digit;

                if (do_break) {
                        gchar *word;

                        word_len = 0;

                        if (!is_drop) {
                                if (normalize)
                                        word = g_ascii_strdown(bp, p - bp);
                                else
                                        word = g_strndup(bp, p - bp);
                                if (do_ngram)
                                        append_ngram_str(out_str, word, is_url);
                                else
                                        APPEND_STR(out_str, word);
                                g_free(word);
                        }

                        if (is_space) {
                                /* skip the whole whitespace run; the next
                                 * word starts after it */
                                do {
                                        p = g_utf8_next_char(p);
                                        wc = g_utf8_get_char(p);
                                } while (g_unichar_isspace(wc));
                                bp = p;
                                continue;
                        }
                        bp = p;
                } else {
                        if (is_space) {
                                /* leading whitespace before any word */
                                word_len = 0;
                                do {
                                        p = g_utf8_next_char(p);
                                        wc = g_utf8_get_char(p);
                                } while (g_unichar_isspace(wc));
                                bp = p;
                                continue;
                        }
                }

                p = g_utf8_next_char(p);
                word_len++;
        }

        /* flush the word still pending at the end of the input */
        if (p > bp) {
                gchar *word;
                gboolean is_drop = FALSE;
                gboolean do_ngram = FALSE;

                if (drop_punct) {
                        is_drop = prev_is_punct;
                }
                if (drop_single_letter) {
                        if (word_len == 1 && prev_uctype != UC_HAN)
                                is_drop = TRUE;
                }
                if (drop_two_hira) {
                        if (word_len == 2 && prev2_uctype == UC_HIRA && prev_uctype == UC_HIRA)
                                is_drop = TRUE;
                }

                if (word_len > MAX_TOKEN_LEN)
                        is_drop = TRUE;

                if (mb_ngram && prev_uctype == UC_HAN && word_len > NGRAM_LEN)
                        do_ngram = TRUE;

                if (!is_drop) {
                        if (normalize)
                                word = g_ascii_strdown(bp, -1);
                        else
                                word = g_strdup(bp);
                        if (do_ngram)
                                append_ngram_str(out_str, word, is_url);
                        else
                                APPEND_STR(out_str, word);
                        g_free(word);
                }
        }

        /* hand the character data to the caller, freeing only the wrapper */
        return g_string_free(out_str, FALSE);
}
322
323
/*
 * Content-filter callback: word-separate a text message.
 *
 * For data whose MIME type starts with "text/" (compared case-insensitively),
 * the body and the From/To/Cc/Subject/Received attributes are run through
 * do_wordsep() and stored in a new XMessageData that is attached to result.
 *
 * filter - unused
 * data   - input message data
 * result - receives the status and, on success, the rewritten message data
 *
 * Returns XF_REWRITTEN on success, XF_UNSUPPORTED_TYPE when the MIME type is
 * missing or not text, and XF_ERROR when result is NULL or the content
 * cannot be obtained.
 */
static XFilterStatus xfilter_wordsep_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const char *mime_type;
        const char *content = NULL;
        char *processed_content;
        XMessageData *msgdata;

        g_return_val_if_fail(result != NULL, XF_ERROR);

        /* g_ascii_strncasecmp() replaces the deprecated, locale-dependent
         * g_strncasecmp(); MIME type names are plain ASCII. */
        mime_type = xfilter_message_data_get_mime_type(data);
        if (!mime_type || g_ascii_strncasecmp(mime_type, "text/", 5) != 0) {
                xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
                return XF_UNSUPPORTED_TYPE;
        }

        /* do_wordsep() dereferences its input unconditionally, so a missing
         * body must be rejected here instead of crashing. */
        content = xfilter_message_data_get_content(data);
        if (!content) {
                xfilter_result_set_status(result, XF_ERROR);
                return XF_ERROR;
        }

        processed_content = do_wordsep(content, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE);

        msgdata = xfilter_message_data_new(NULL, mime_type);
        /* NOTE(review): processed_content is not freed here, unlike in the
         * header case below; presumably xfilter_message_data_set_content()
         * takes ownership of the buffer - confirm against filter.c. */
        xfilter_message_data_set_content(msgdata, processed_content);

/* Word-separate one header attribute (with is_header set, so URLs are not
 * special-cased) and store the result under the same attribute in msgdata.
 * Attributes that are absent from the input are skipped. */
#define WORDSEP_HEADER(attr)                                                \
do {                                                                        \
        content = xfilter_message_data_get_attribute(data, attr);        \
        if (content) {                                                        \
                xfilter_debug_print("header: %s\n", content);                \
                processed_content =                                        \
                        do_wordsep(content, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE); \
                xfilter_message_data_set_attribute                        \
                        (msgdata, attr, processed_content, FALSE);        \
                g_free(processed_content);                                \
        }                                                                \
} while (0)

        WORDSEP_HEADER(XM_FROM);
        WORDSEP_HEADER(XM_TO);
        WORDSEP_HEADER(XM_CC);
        WORDSEP_HEADER(XM_SUBJECT);
        WORDSEP_HEADER(XM_RECEIVED);

#undef WORDSEP_HEADER

        xfilter_result_set_message_data(result, msgdata);

        xfilter_result_set_status(result, XF_REWRITTEN);

        return XF_REWRITTEN;
}
375
376
XFilter *xfilter_wordsep_new(void)
377
{
378
        XFilter *filter;
379
380
        filter = xfilter_new(XF_CONTENT, "wordsep");
381
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter),
382
                                        xfilter_wordsep_func);
383
384
        return filter;
385
}