Statistics
| Branch: | Tag: | Revision:

root / lib / filters / wordsep-filter.c @ aebfd4cc

History | View | Annotate | Download (7.74 KB)

1
/* SylFilter - a message filter
 *
 * Copyright (C) 2011 Hiroyuki Yamamoto
 * Copyright (C) 2011 Sylpheed Development Team
 */
6

    
7
#include <glib.h>
8
#include <string.h>
9

    
10
#include "filter.h"
11
#include "wordsep-filter.h"
12

    
13
/*
 * Script class assigned to each character while splitting text into
 * words.  Only wide characters are classified; everything else stays
 * UC_OTHER.
 */
typedef enum
{
	UC_OTHER,	/* not matched by any of the classifiers below */
	UC_HIRA,	/* matched by u_is_hira() */
	UC_KATA,	/* matched by u_is_kata() */
	UC_HAN		/* matched by u_is_han() */
} UCType;
20

    
21
/*
 * True when c lies in the inclusive range [start, end].
 * Every argument is parenthesized so that expression arguments keep their
 * meaning.  c is evaluated twice: do not pass an expression with side effects.
 */
#define URANGE(c, start, end) ((c) >= (start) && (c) <= (end))
22

    
23
/*
 * True when code point c is treated as hiragana by the word splitter:
 * the letters U+3041..U+3096 and the marks U+309D..U+309F.
 */
#define u_is_hira(c)	(URANGE(c, 0x3041, 0x3096) ||	\
			 URANGE(c, 0x309d, 0x309f))
25
/*
 * True when code point c is treated as katakana by the word splitter.
 * Covers full-width letters and marks, the U+31F0..U+31FF extension range,
 * and two half-width ranges.  U+30FB, U+30FC and U+FF70 are deliberately
 * outside the listed ranges.
 */
#define u_is_kata(c)	(URANGE(c, 0x30a1, 0x30fa) ||	/* full-width letters */ \
			 URANGE(c, 0x30fd, 0x30ff) ||	/* full-width marks */	\
			 URANGE(c, 0x31f0, 0x31ff) ||	/* phonetic extensions */ \
			 URANGE(c, 0xff66, 0xff6f) ||	/* half-width, part 1 */ \
			 URANGE(c, 0xff71, 0xff9d))	/* half-width, part 2 */
30
/*
 * True when code point c is treated as a Han ideograph (or an ideographic
 * radical, numeral or iteration mark) by the word splitter.
 */
#define u_is_han(c)	(URANGE(c, 0x2e80, 0x2fdf) ||	/* CJK / Kangxi radicals */ \
			 (c) == 0x3005 ||		/* iteration mark */	\
			 (c) == 0x3007 ||		/* ideographic zero */	\
			 URANGE(c, 0x3021, 0x3029) ||	/* ideographic numerals */ \
			 URANGE(c, 0x3038, 0x303b) ||	/* more numerals / marks */ \
			 URANGE(c, 0x3400, 0x4dbf) ||	/* extension A */	\
			 URANGE(c, 0x4e00, 0x9fff) ||	/* unified ideographs */ \
			 URANGE(c, 0xf900, 0xfaff) ||	/* compatibility ideographs */ \
			 URANGE(c, 0x20000, 0x2ffff))	/* supplementary plane 2 */
39

    
40
/*
 * Append word w to the GString s.  A single space is inserted first when s
 * already holds text.
 *
 * The macro also reads a variable named `is_url` that must be in scope at
 * the expansion site: when it is true, the word is prefixed with "Url*".
 *
 * Wrapped in do { } while (0) so that it behaves as one statement, e.g. in
 * an unbraced if/else.
 */
#define APPEND_STR(s, w)				\
do {							\
	if ((s)->len > 0)				\
		g_string_append_c((s), ' ');		\
	if (is_url)					\
		g_string_append((s), "Url*");		\
	g_string_append((s), (w));			\
} while (0)
48

    
49
/*
 * Split `content` into words and return them joined by single spaces.
 *
 * The text is walked one character at a time with g_utf8_get_char(), so it
 * is presumably expected to be valid UTF-8 (NOTE(review): not validated
 * here, and `content` must not be NULL).  A word boundary is placed where
 * the character class changes: wide vs. narrow, punctuation vs.
 * non-punctuation, whitespace, and transitions between Han, katakana,
 * hiragana and other characters.
 *
 * content:            NUL-terminated input text.
 * drop_punct:         drop a word whose last character was punctuation.
 * drop_single_letter: drop one-character words unless the character is Han.
 * drop_two_hira:      drop two-character words made of two hiragana
 *                     (applied at breaks inside the text only; the final
 *                     word is not checked).
 * normalize:          lower-case ASCII letters of each word.
 * is_url:             prefix each emitted word with "Url*" (see APPEND_STR)
 *                     and disable URL detection.
 * is_header:          disable URL detection.
 *
 * When URL detection is enabled, a word starting with "http://" or
 * "https://" (optionally followed by "www.") is cut out up to the next
 * non-graphic ASCII character and split by a recursive call with
 * is_url = TRUE.
 *
 * Returns a newly allocated string that the caller must release with
 * g_free().
 */
static char *do_wordsep(const char *content, gboolean drop_punct,
			gboolean drop_single_letter, gboolean drop_two_hira,
			gboolean normalize, gboolean is_url, gboolean is_header)
{
	GString *out_str;
	const char *p = content;	/* current character */
	const char *bp = content;	/* start of the current word */
	UCType prev_uctype = UC_OTHER;
	UCType prev2_uctype = UC_OTHER;
	GUnicodeType prev_utype = G_UNICODE_OTHER_LETTER;
	gboolean prev_is_wide = FALSE;
	gboolean prev_is_punct = FALSE;
	gboolean prev_is_digit = FALSE;
	gint word_len = 0;		/* characters in the current word */

	out_str = g_string_new("");

	while (*p != '\0') {
		gunichar wc;
		UCType uctype = UC_OTHER;
		GUnicodeType utype;
		gboolean is_wide, is_space, is_punct, is_digit;
		gboolean do_break = FALSE;
		gboolean is_drop = FALSE;

		/* URL detection, only at the start of a word */
		if (!is_header && !is_url && p == bp && *p == 'h') {
			if (!strncmp(p, "http://", 7))
				p += 7;
			else if (!strncmp(p, "https://", 8))
				p += 8;
			if (p > bp) {
				if (!strncmp(p, "www.", 4))
					p += 4;
			}

			if (p > bp) {
				const char *ep = p;
				char *url_str, *sep_str;

				while (*ep != '\0' && g_ascii_isgraph(*ep))
					ep++;
				url_str = g_strndup(p, ep - p);
				sep_str = do_wordsep(url_str, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE);
				if (out_str->len > 0)
					g_string_append_c(out_str, ' ');
				g_string_append(out_str, sep_str);
				/* g_string_append() copies, so both temporaries are ours to free */
				g_free(sep_str);
				g_free(url_str);
				word_len = 0;
				bp = p = ep;
				continue;
			}
		}

		wc = g_utf8_get_char(p);
		utype = g_unichar_type(wc);
		/* U+25CB and U+25CE..U+25D1 are treated as wide as well */
		is_wide = g_unichar_iswide(wc) || wc == 0x25cb ||
			URANGE(wc, 0x25ce, 0x25d1);
		is_space = g_unichar_isspace(wc);
		/* '!', '$', '\'' and '-' are never treated as punctuation */
		is_punct = (*p != '!' && *p != '$' && *p != '\'' && *p != '-')
			&& g_unichar_ispunct(wc);
		is_digit = g_unichar_isdigit(wc);
		if (is_wide) {
			if (u_is_hira(wc))
				uctype = UC_HIRA;
			else if (u_is_kata(wc))
				uctype = UC_KATA;
			else if (u_is_han(wc))
				uctype = UC_HAN;
		}
#if 0
		if (utype == G_UNICODE_DASH_PUNCTUATION ||
		    utype == G_UNICODE_OTHER_SYMBOL)
			is_punct = FALSE;
#endif
		/* handle 1,234,567 / 192.168.0.1 */
		if (*p == '.' || *p == ',') {
			if (prev_is_digit) {
				const char *np;
				gunichar nwc;
				np = g_utf8_next_char(p);
				nwc = g_utf8_get_char(np);
				if (g_unichar_isdigit(nwc))
					is_punct = FALSE;
			}
		}

#if 0
		{
			gchar s[7] = {0};
			g_unichar_to_utf8(wc, s);
			xfilter_debug_print("%s: utype=%d word_len=%d\n", s, utype, word_len);
		}
#endif

		/* decide whether the current word ends before this character */
		if (p > bp) {
			if (prev_is_wide != is_wide)
				do_break = TRUE;
			else if (prev_is_punct != is_punct)
				do_break = TRUE;
			else if (is_space || is_punct)
				do_break = TRUE;
			else if (prev_uctype != UC_HAN && uctype == UC_HAN)
				do_break = TRUE;
			else if (prev_uctype == UC_HAN && uctype != UC_HAN)
				do_break = TRUE;
			else if (prev_uctype == UC_KATA &&
				 uctype != UC_KATA &&
				 utype != G_UNICODE_MODIFIER_LETTER &&
				 utype != G_UNICODE_OTHER_SYMBOL)
				do_break = TRUE;
			else if (prev_uctype == UC_HIRA &&
				 uctype != UC_HIRA &&
				 utype != G_UNICODE_MODIFIER_LETTER &&
				 utype != G_UNICODE_OTHER_SYMBOL)
				do_break = TRUE;
			else if (prev_uctype == UC_OTHER &&
				 uctype != UC_OTHER &&
				 prev_utype != G_UNICODE_MODIFIER_LETTER)
				do_break = TRUE;
			else if (prev_utype == G_UNICODE_MODIFIER_LETTER &&
				 prev2_uctype != uctype)
				do_break = TRUE;
		}

		/* decide whether the word that just ended is discarded */
		if (drop_punct) {
			is_drop = prev_is_punct;
		}
		if (drop_single_letter) {
			if (word_len == 1 && do_break && prev_uctype != UC_HAN)
				is_drop = TRUE;
		}
		if (drop_two_hira) {
			if (word_len == 2 && do_break && prev2_uctype == UC_HIRA && prev_uctype == UC_HIRA)
				is_drop = TRUE;
		}

		prev2_uctype = prev_uctype;
		prev_uctype = uctype;
		prev_utype = utype;
		prev_is_wide = is_wide;
		prev_is_punct = is_punct;
		prev_is_digit = is_digit;

		if (do_break) {
			gchar *word;

			word_len = 0;

			if (!is_drop) {
				if (normalize)
					word = g_ascii_strdown(bp, p - bp);
				else
					word = g_strndup(bp, p - bp);
				g_strstrip(word);
				if (*word == '\0') {
					/* nothing left after trimming: restart at this character */
					g_free(word);
					bp = p;
					p = g_utf8_next_char(p);
					continue;
				}

				APPEND_STR(out_str, word);
				g_free(word);
			}

			if (is_space) {
				/* skip the whole run of whitespace */
				do {
					p = g_utf8_next_char(p);
					wc = g_utf8_get_char(p);
				} while (g_unichar_isspace(wc));
				bp = p;
				continue;
			}
			bp = p;
		} else {
			if (is_space) {
				word_len = 0;
				do {
					p = g_utf8_next_char(p);
					wc = g_utf8_get_char(p);
				} while (g_unichar_isspace(wc));
				bp = p;
				continue;
			}
		}

		p = g_utf8_next_char(p);
		word_len++;
	}

	/* flush the last word, if any */
	if (p > bp) {
		gchar *word;
		gboolean is_drop = FALSE;

		if (drop_punct) {
			is_drop = prev_is_punct;
		}
		if (drop_single_letter) {
			if (word_len == 1 && prev_uctype != UC_HAN)
				is_drop = TRUE;
		}

		if (!is_drop) {
			if (normalize)
				word = g_ascii_strdown(bp, -1);
			else
				word = g_strdup(bp);
			g_strstrip(word);
			if (*word != '\0') {
				APPEND_STR(out_str, word);
			}
			g_free(word);
		}
	}

	/* hand the character data to the caller, free only the GString shell */
	return g_string_free(out_str, FALSE);
}
272

    
273
/*
 * Content-filter callback: rewrite a text message as space-separated words.
 *
 * Messages without a MIME type, or whose type does not start with "text/"
 * (compared case-insensitively), are rejected with XF_UNSUPPORTED_TYPE.
 * Otherwise a new XMessageData is created with the word-separated body, and
 * the From, To, Cc and Subject attributes are word-separated too when they
 * are present.  The new message data is stored in `result`, whose status is
 * set to XF_REWRITTEN.
 *
 * filter: not used by this function.
 * data:   message to process.
 * result: receives the status and the rewritten message; must not be NULL.
 *
 * Returns the same status that was stored in `result`, or XF_ERROR when
 * `result` is NULL.
 */
static XFilterStatus xfilter_wordsep_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
	const char *mime_type;
	const char *content = NULL;
	char *processed_content;
	XMessageData *msgdata;

	g_return_val_if_fail(result != NULL, XF_ERROR);

	/* g_ascii_strncasecmp() is locale-independent, unlike the deprecated
	 * g_strncasecmp() */
	mime_type = xfilter_message_data_get_mime_type(data);
	if (!mime_type || g_ascii_strncasecmp(mime_type, "text/", 5) != 0) {
		xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
		return XF_UNSUPPORTED_TYPE;
	}

	/* NOTE(review): do_wordsep() dereferences its input, so this relies on
	 * the content of a text part never being NULL -- confirm. */
	content = xfilter_message_data_get_content(data);
	processed_content = do_wordsep(content, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE);

	msgdata = xfilter_message_data_new(NULL, mime_type);
	/* NOTE(review): processed_content is not freed here; presumably
	 * xfilter_message_data_set_content() takes ownership -- confirm. */
	xfilter_message_data_set_content(msgdata, processed_content);

/* Word-separate one header attribute of `data` and store it in `msgdata`. */
#define WORDSEP_HEADER(attr)						\
do {									\
	content = xfilter_message_data_get_attribute(data, (attr));	\
	if (content) {							\
		xfilter_debug_print("header: %s\n", content);		\
		processed_content =					\
			do_wordsep(content, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE); \
		xfilter_message_data_set_attribute			\
			(msgdata, (attr), processed_content, FALSE);	\
		g_free(processed_content);				\
	}								\
} while (0)

	WORDSEP_HEADER(XM_FROM);
	WORDSEP_HEADER(XM_TO);
	WORDSEP_HEADER(XM_CC);
	WORDSEP_HEADER(XM_SUBJECT);

#undef WORDSEP_HEADER

	xfilter_result_set_message_data(result, msgdata);

	xfilter_result_set_status(result, XF_REWRITTEN);

	return XF_REWRITTEN;
}
324

    
325
XFilter *xfilter_wordsep_new(void)
326
{
327
        XFilter *filter;
328

    
329
        filter = xfilter_new(XF_CONTENT, "wordsep");
330
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter),
331
                                        xfilter_wordsep_func);
332

    
333
        return filter;
334
}