Statistics
| Branch: | Tag: | Revision:

root / lib / filters / wordsep-filter.c @ a88bc9d0

History | View | Annotate | Download (8.9 kB)

1
/* SylFilter - a message filter
 *
 * Copyright (C) 2011 Hiroyuki Yamamoto
 * Copyright (C) 2011 Sylpheed Development Team
 */
6
7
#include <glib.h>
8
#include <string.h>
9
10
#include "filter.h"
11
#include "wordsep-filter.h"
12
13
/*
 * Coarse script class of a character.  The word splitter compares the
 * class of the current character with that of the preceding ones to
 * decide where a word ends.
 */
typedef enum
{
        UC_OTHER,       /* none of the classes below */
        UC_HIRA,        /* hiragana */
        UC_KATA,        /* katakana */
        UC_HAN          /* han ideograph */
} UCType;
20
21
/* Words longer than this many characters are dropped from the output. */
#define MAX_TOKEN_LEN        42

/*
 * Non-zero when c lies in the inclusive range [start, end].
 * c is evaluated twice, so pass an expression without side effects.
 */
#define URANGE(c, start, end) ((c) >= (start) && (c) <= (end))

/*
 * Script classification of a Unicode code point.  Each macro evaluates
 * its argument several times, so pass an expression without side effects.
 */

/* Hiragana letters and the hiragana iteration marks. */
#define u_is_hira(c)        (URANGE(c, 0x3041, 0x3096) ||        \
                         URANGE(c, 0x309d, 0x309f))

/* Katakana, katakana phonetic extensions and halfwidth katakana. */
#define u_is_kata(c)        (URANGE(c, 0x30a1, 0x30fa) ||        \
                         URANGE(c, 0x30fd, 0x30ff) ||        \
                         URANGE(c, 0x31f0, 0x31ff) ||        \
                         URANGE(c, 0xff66, 0xff6f) ||        \
                         URANGE(c, 0xff71, 0xff9d))

/* CJK radicals, ideographic marks and numerals, and the ideograph blocks. */
#define u_is_han(c)        (URANGE(c, 0x2e80, 0x2fdf) ||        \
                         (c) == 0x3005 ||                \
                         (c) == 0x3007 ||                \
                         URANGE(c, 0x3021, 0x3029) ||        \
                         URANGE(c, 0x3038, 0x303b) ||        \
                         URANGE(c, 0x3400, 0x4dbf) ||        \
                         URANGE(c, 0x4e00, 0x9fff) ||        \
                         URANGE(c, 0xf900, 0xfaff) ||        \
                         URANGE(c, 0x20000, 0x2ffff))
41
42
/*
 * Append the string w to the GString s as a new token.  A single space is
 * written first when s is not empty.
 *
 * The macro reads a variable named is_url from the scope it is expanded
 * in; when that is true the token is prefixed with "Url*".
 *
 * s is evaluated more than once.  The do/while wrapper makes the expansion
 * a single statement, so it is safe as the body of an unbraced if/else.
 */
#define APPEND_STR(s, w)                                \
do {                                                    \
        if ((s)->len > 0)                               \
                g_string_append_c((s), ' ');            \
        if (is_url)                                     \
                g_string_append((s), "Url*");           \
        g_string_append((s), (w));                      \
} while (0)
50
51
52
/*
 * Append every run of three consecutive UTF-8 characters of word to str,
 * each as its own space-separated token.  For a word of N characters this
 * yields N - 2 tokens; a word shorter than three characters adds nothing.
 *
 * str:    output buffer, appended to in place.
 * word:   NUL-terminated UTF-8 string to split.
 * is_url: when TRUE every token is prefixed with "Url*".
 */
static void append_trigram_str(GString *str, const char *word, gboolean is_url)
{
        const char *start = word;

        xfilter_debug_print("append_trigram_str: %s\n", word);

        while (*start != '\0') {
                const char *end = start;
                int n_chars = 0;

                /* Advance end over up to three characters. */
                while (n_chars < 3 && *end != '\0') {
                        end = g_utf8_next_char(end);
                        n_chars++;
                }
                /* Fewer than three characters remain: no more trigrams. */
                if (n_chars < 3)
                        break;

                if (str->len > 0)
                        g_string_append_c(str, ' ');
                if (is_url)
                        g_string_append(str, "Url*");
                g_string_append_len(str, start, end - start);
                /* "%.*s" takes an int precision, not a ptrdiff_t. */
                xfilter_debug_print("trigram: %.*s\n", (int)(end - start), start);

                /* Slide the window forward by one character. */
                start = g_utf8_next_char(start);
        }
}
82
83
static char *do_wordsep(const char *content, gboolean drop_punct,
84
                        gboolean drop_single_letter, gboolean drop_two_hira,
85
                        gboolean normalize, gboolean mb_trigram,
86
                        gboolean is_url, gboolean is_header)
87
{
88
        GString *out_str;
89
        const char *p = content;
90
        const char *bp = content;
91
        UCType prev_uctype = UC_OTHER;
92
        UCType prev2_uctype = UC_OTHER;
93
        GUnicodeType prev_utype = G_UNICODE_OTHER_LETTER;
94
        GUnicodeBreakType prev_btype;
95
        gboolean prev_is_wide = FALSE;
96
        gboolean prev_is_space = FALSE;
97
        gboolean prev_is_punct = FALSE;
98
        gboolean prev_is_digit = FALSE;
99
        gint word_len = 0;
100
101
        out_str = g_string_new("");
102
103
        while (*p != '\0') {
104
                gunichar wc;
105
                UCType uctype = UC_OTHER;
106
                GUnicodeType utype = G_UNICODE_OTHER_LETTER;
107
                GUnicodeBreakType btype;
108
                gboolean is_wide, is_space, is_punct, is_digit;
109
                gboolean do_break = FALSE;
110
                gboolean is_drop = FALSE;
111
                gboolean do_trigram = FALSE;
112
113
                if (!is_header && !is_url && p == bp && *p == 'h') {
114
                        if (!strncmp(p, "http://", 7))
115
                                p += 7;
116
                        else if (!strncmp(p, "https://", 8))
117
                                p += 8;
118
                        if (p > bp) {
119
                                if (!strncmp(p, "www.", 4))
120
                                        p += 4;
121
                        }
122
123
                        if (p > bp) {
124
                                const char *ep = p;
125
                                char *url_str, *sep_str;
126
127
                                while (*ep != '\0' && g_ascii_isgraph(*ep))
128
                                        ep++;
129
                                url_str = g_strndup(p, ep - p);
130
                                sep_str = do_wordsep(url_str, drop_punct, drop_single_letter, drop_two_hira, normalize, mb_trigram, TRUE, is_header);
131
                                if (out_str->len > 0)
132
                                        g_string_append_c(out_str, ' ');
133
                                g_string_append(out_str, sep_str);
134
                                g_free(url_str);
135
                                word_len = 0;
136
                                bp = p = ep;
137
                                continue;
138
                        }
139
                }
140
141
                wc = g_utf8_get_char(p);
142
                utype = g_unichar_type(wc);
143
                btype = g_unichar_break_type(wc);
144
                is_wide = g_unichar_iswide(wc) || wc == 0x25cb ||
145
                        URANGE(wc, 0x25ce, 0x25d1);
146
                is_space = g_unichar_isspace(wc);
147
                is_punct = (*p != '!' && *p != '$' && *p != '\'' && *p != '-')
148
                        && g_unichar_ispunct(wc);
149
                is_digit = g_unichar_isdigit(wc);
150
                if (is_wide) {
151
                        if (u_is_hira(wc))
152
                                uctype = UC_HIRA;
153
                        else if (u_is_kata(wc))
154
                                uctype = UC_KATA;
155
                        else if (u_is_han(wc))
156
                                uctype = UC_HAN;
157
                }
158
#if 0
159
                if (utype == G_UNICODE_DASH_PUNCTUATION ||
160
                    utype == G_UNICODE_OTHER_SYMBOL)
161
                        is_punct = FALSE;
162
#endif
163
                /* handle 1,234,567 / 192.168.0.1 */
164
                if (*p == '.' || *p == ',') {
165
                        if (prev_is_digit) {
166
                                const char *np;
167
                                gunichar nwc;
168
                                np = g_utf8_next_char(p);
169
                                nwc = g_utf8_get_char(np);
170
                                if (g_unichar_isdigit(nwc))
171
                                        is_punct = FALSE;
172
                        }
173
                }
174
175
#if 0
176
                {
177
                        gchar s[7] = {0};
178
                        g_unichar_to_utf8(wc, s);
179
                        xfilter_debug_print("%s: utype=%d word_len=%d\n", s, utype, word_len);
180
                }
181
#endif
182
183
                if (p > bp) {
184
                        if (prev_is_wide != is_wide)
185
                                do_break = TRUE;
186
                        else if (prev_is_punct != is_punct)
187
                                do_break = TRUE;
188
                        else if (is_space || is_punct)
189
                                do_break = TRUE;
190
                        else if (prev_uctype != UC_HAN && uctype == UC_HAN)
191
                                do_break = TRUE;
192
                        else if (prev_uctype == UC_HAN && uctype != UC_HAN)
193
                                do_break = TRUE;
194
                        else if (prev_uctype == UC_KATA &&
195
                                 uctype != UC_KATA &&
196
                                 utype != G_UNICODE_MODIFIER_LETTER &&
197
                                 utype != G_UNICODE_OTHER_SYMBOL)
198
                                do_break = TRUE;
199
                        else if (prev_uctype == UC_HIRA &&
200
                                 uctype != UC_HIRA &&
201
                                 utype != G_UNICODE_MODIFIER_LETTER &&
202
                                 utype != G_UNICODE_OTHER_SYMBOL)
203
                                do_break = TRUE;
204
                        else if (prev_uctype == UC_OTHER &&
205
                                 uctype != UC_OTHER &&
206
                                 prev_utype != G_UNICODE_MODIFIER_LETTER)
207
                                do_break = TRUE;
208
                        else if (prev_utype == G_UNICODE_MODIFIER_LETTER &&
209
                                 prev2_uctype != uctype)
210
                                do_break = TRUE;
211
                }
212
213
                if (drop_punct) {
214
                        is_drop = prev_is_punct;
215
                }
216
                if (drop_single_letter) {
217
                        if (word_len == 1 && do_break && prev_uctype != UC_HAN)
218
                                is_drop = TRUE;
219
                }
220
                if (drop_two_hira) {
221
                        if (word_len == 2 && do_break && prev2_uctype == UC_HIRA && prev_uctype == UC_HIRA)
222
                                is_drop = TRUE;
223
                }
224
225
                if (word_len > MAX_TOKEN_LEN && do_break)
226
                        is_drop = TRUE;
227
228
                if (mb_trigram && !is_drop && do_break &&
229
                    prev_uctype == UC_HAN && word_len > 3)
230
                        do_trigram = TRUE;
231
232
                prev2_uctype = prev_uctype;
233
                prev_uctype = uctype;
234
                prev_utype = utype;
235
                prev_btype = btype;
236
                prev_is_wide = is_wide;
237
                prev_is_space = is_space;
238
                prev_is_punct = is_punct;
239
                prev_is_digit = is_digit;
240
241
                if (do_break) {
242
                        gchar *word;
243
244
                        word_len = 0;
245
246
                        if (!is_drop) {
247
                                if (normalize)
248
                                        word = g_ascii_strdown(bp, p - bp);
249
                                else
250
                                        word = g_strndup(bp, p - bp);
251
                                if (do_trigram)
252
                                        append_trigram_str(out_str, word, is_url);
253
                                else
254
                                        APPEND_STR(out_str, word);
255
                                g_free(word);
256
                        }
257
258
                        if (is_space) {
259
                                do {
260
                                        p = g_utf8_next_char(p);
261
                                        wc = g_utf8_get_char(p);
262
                                } while (g_unichar_isspace(wc));
263
                                bp = p;
264
                                continue;
265
                        }
266
                        bp = p;
267
                } else {
268
                        if (is_space) {
269
                                word_len = 0;
270
                                do {
271
                                        p = g_utf8_next_char(p);
272
                                        wc = g_utf8_get_char(p);
273
                                } while (g_unichar_isspace(wc));
274
                                bp = p;
275
                                continue;
276
                        }
277
                }
278
279
                p = g_utf8_next_char(p);
280
                word_len++;
281
        }
282
283
        if (p > bp) {
284
                gchar *word;
285
                gboolean is_drop = FALSE;
286
                gboolean do_trigram = FALSE;
287
288
                if (drop_punct) {
289
                        is_drop = prev_is_punct;
290
                }
291
                if (drop_single_letter) {
292
                        if (word_len == 1 && prev_uctype != UC_HAN)
293
                                is_drop = TRUE;
294
                }
295
                if (drop_two_hira) {
296
                        if (word_len == 2 && prev2_uctype == UC_HIRA && prev_uctype == UC_HIRA)
297
                                is_drop = TRUE;
298
                }
299
300
                if (word_len > MAX_TOKEN_LEN)
301
                        is_drop = TRUE;
302
303
                if (mb_trigram && prev_uctype == UC_HAN && word_len > 3)
304
                        do_trigram = TRUE;
305
306
                if (!is_drop) {
307
                        if (normalize)
308
                                word = g_ascii_strdown(bp, -1);
309
                        else
310
                                word = g_strdup(bp);
311
                        if (do_trigram)
312
                                append_trigram_str(out_str, word, is_url);
313
                        else
314
                                APPEND_STR(out_str, word);
315
                        g_free(word);
316
                }
317
        }
318
319
        return g_string_free(out_str, FALSE);
320
}
321
322
/*
 * Content filter callback: replace the text of a message with its
 * word-separated form.
 *
 * Only messages whose MIME type starts with "text/" (compared without
 * regard to ASCII case) are handled; for anything else the result status
 * is set to, and the function returns, XF_UNSUPPORTED_TYPE.
 *
 * On success a new message data object is created with the processed body
 * and processed From / To / Cc / Subject attributes, it is stored in
 * result, and XF_REWRITTEN is returned.
 *
 * filter: unused.
 * data:   the message to process.
 * result: receives the new message data and the status; must not be NULL.
 */
static XFilterStatus xfilter_wordsep_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const char *mime_type;
        const char *content = NULL;
        char *processed_content;
        XMessageData *msgdata;

        g_return_val_if_fail(result != NULL, XF_ERROR);

        mime_type = xfilter_message_data_get_mime_type(data);
        /* g_ascii_strncasecmp: MIME types are ASCII, so ignore the locale. */
        if (!mime_type || g_ascii_strncasecmp(mime_type, "text/", 5) != 0) {
                xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
                return XF_UNSUPPORTED_TYPE;
        }

        content = xfilter_message_data_get_content(data);
        processed_content = do_wordsep(content, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE);

        msgdata = xfilter_message_data_new(NULL, mime_type);
        /*
         * NOTE(review): processed_content is not freed here; presumably
         * xfilter_message_data_set_content() takes ownership - confirm.
         */
        xfilter_message_data_set_content(msgdata, processed_content);

/* Word-separate one header attribute of data and store it in msgdata. */
#define WORDSEP_HEADER(attr)                                                \
do {                                                                        \
        content = xfilter_message_data_get_attribute(data, attr);        \
        if (content) {                                                        \
                xfilter_debug_print("header: %s\n", content);                \
                processed_content =                                        \
                        do_wordsep(content, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE); \
                xfilter_message_data_set_attribute                        \
                        (msgdata, attr, processed_content, FALSE);        \
                g_free(processed_content);                                \
        }                                                                \
} while (0)

        WORDSEP_HEADER(XM_FROM);
        WORDSEP_HEADER(XM_TO);
        WORDSEP_HEADER(XM_CC);
        WORDSEP_HEADER(XM_SUBJECT);

#undef WORDSEP_HEADER

        xfilter_result_set_message_data(result, msgdata);

        xfilter_result_set_status(result, XF_REWRITTEN);

        return XF_REWRITTEN;
}
373
374
XFilter *xfilter_wordsep_new(void)
375
{
376
        XFilter *filter;
377
378
        filter = xfilter_new(XF_CONTENT, "wordsep");
379
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter),
380
                                        xfilter_wordsep_func);
381
382
        return filter;
383
}