Statistics
| Branch: | Tag: | Revision:

root / lib / filters / wordsep-filter.c @ 940d391b

History | View | Annotate | Download (6.7 kB)

1
#include <glib.h>
2
3
#include "filter.h"
4
#include "wordsep-filter.h"
5
6
/*
 * Script class assigned to each character while splitting text into words.
 * do_wordsep() only classifies wide characters; everything else stays
 * UC_OTHER.
 */
typedef enum
{
        UC_OTHER = 0,   /* none of the classes below */
        UC_HIRA  = 1,   /* matched by u_is_hira() */
        UC_KATA  = 2,   /* matched by u_is_kata() */
        UC_HAN   = 3    /* matched by u_is_han() */
} UCType;
13
14
/*
 * URANGE(c, start, end): non-zero when start <= c <= end (inclusive).
 *
 * Every argument is parenthesized so that expression arguments (ternaries,
 * bitwise operators, ...) keep their intended grouping.  Note that `c` is
 * still expanded twice, so pass side-effect-free expressions only.
 */
#define URANGE(c, start, end) ((c) >= (start) && (c) <= (end))

/* Code points classified as hiragana (UC_HIRA). */
#define u_is_hira(c)        (URANGE((c), 0x3041, 0x3096) ||        \
                         URANGE((c), 0x309d, 0x309f))

/*
 * Code points classified as katakana (UC_KATA); the list includes the
 * 0x31f0-0x31ff block and two ranges in the 0xff66-0xff9d area.
 */
#define u_is_kata(c)        (URANGE((c), 0x30a1, 0x30fa) ||        \
                         URANGE((c), 0x30fd, 0x30ff) ||        \
                         URANGE((c), 0x31f0, 0x31ff) ||        \
                         URANGE((c), 0xff66, 0xff6f) ||        \
                         URANGE((c), 0xff71, 0xff9d))

/*
 * Code points classified as han (UC_HAN): several ranges plus the two
 * single code points 0x3005 and 0x3007.
 */
#define u_is_han(c)        (URANGE((c), 0x2e80, 0x2fdf) ||        \
                         (c) == 0x3005 ||                \
                         (c) == 0x3007 ||                \
                         URANGE((c), 0x3021, 0x3029) ||        \
                         URANGE((c), 0x3038, 0x303b) ||        \
                         URANGE((c), 0x3400, 0x4dbf) ||        \
                         URANGE((c), 0x4e00, 0x9fff) ||        \
                         URANGE((c), 0xf900, 0xfaff) ||        \
                         URANGE((c), 0x20000, 0x2ffff))
32
33
/*
 * do_wordsep:
 * @content: NUL-terminated text to split; NULL is treated as an empty string.
 *           NOTE(review): the scanner advances with g_utf8_next_char(), so
 *           the text is assumed to be valid UTF-8 -- confirm that callers
 *           guarantee this.
 * @drop_punct: drop a segment whose last character was punctuation.
 * @drop_single_letter: drop a one-character segment unless that character
 *                      is classified as UC_HAN.
 * @drop_two_hira: drop a two-character segment made of two UC_HIRA
 *                 characters.
 * @normalize: lower-case ASCII letters of each emitted word
 *             (g_ascii_strdown()).
 *
 * Walks @content one character at a time and starts a new word whenever
 * the character class changes (wide/narrow, punctuation, whitespace,
 * han/kana/other script).  The resulting words are joined with a single
 * space.
 *
 * NOTE(review): the segment still pending at the end of the input is only
 * subject to @drop_punct; @drop_single_letter and @drop_two_hira are not
 * applied to it.  Behaviour kept as-is -- confirm whether this is intended.
 *
 * Returns: a newly allocated string; the caller frees it with g_free().
 */
static char *do_wordsep(const char *content, gboolean drop_punct,
                        gboolean drop_single_letter, gboolean drop_two_hira,
                        gboolean normalize)
{
        GString *out_str;
        const char *p = content;        /* current scan position */
        const char *bp = content;       /* start of the pending word */
        UCType prev_uctype = UC_OTHER;
        UCType prev2_uctype = UC_OTHER;
        GUnicodeType prev_utype = G_UNICODE_OTHER_LETTER;
        gboolean prev_is_wide = FALSE;
        gboolean prev_is_punct = FALSE;
        gboolean prev_is_digit = FALSE;
        gint word_len = 0;              /* characters in the pending word */

        /* Nothing to split: behave exactly like an empty input string. */
        if (content == NULL)
                return g_strdup("");

        out_str = g_string_new("");

        while (*p != '\0') {
                gunichar wc;
                UCType uctype = UC_OTHER;
                GUnicodeType utype;
                gboolean is_wide, is_space, is_punct, is_digit;
                gboolean do_break = FALSE;
                gboolean is_drop = FALSE;

                wc = g_utf8_get_char(p);
                utype = g_unichar_type(wc);
                /* U+25CB and U+25CE..U+25D1 are handled as wide as well. */
                is_wide = g_unichar_iswide(wc) || wc == 0x25cb ||
                        URANGE(wc, 0x25ce, 0x25d1);
                is_space = g_unichar_isspace(wc);
                /* '!', '$', '\'' and '-' are never treated as punctuation. */
                is_punct = (*p != '!' && *p != '$' && *p != '\'' && *p != '-')
                        && g_unichar_ispunct(wc);
                is_digit = g_unichar_isdigit(wc);
                if (is_wide) {
                        if (u_is_hira(wc))
                                uctype = UC_HIRA;
                        else if (u_is_kata(wc))
                                uctype = UC_KATA;
                        else if (u_is_han(wc))
                                uctype = UC_HAN;
                }

                /* handle 1,234,567 / 192.168.0.1 */
                if (*p == '.' || *p == ',') {
                        if (prev_is_digit) {
                                const char *np;
                                gunichar nwc;

                                /* '.' / ',' between two digits stays in the word */
                                np = g_utf8_next_char(p);
                                nwc = g_utf8_get_char(np);
                                if (g_unichar_isdigit(nwc))
                                        is_punct = FALSE;
                        }
                }

#ifdef WORDSEP_DEBUG
                /* Per-character trace; only built when WORDSEP_DEBUG is defined. */
                {
                        gchar s[7] = {0};
                        g_unichar_to_utf8(wc, s);
                        g_print("%s: utype=%d word_len=%d\n", s, utype, word_len);
                }
#endif

                /* Decide whether the current character starts a new word. */
                if (p > bp) {
                        if (prev_is_wide != is_wide)
                                do_break = TRUE;
                        else if (prev_is_punct != is_punct)
                                do_break = TRUE;
                        else if (is_space || is_punct)
                                do_break = TRUE;
                        else if (prev_uctype != UC_HAN && uctype == UC_HAN)
                                do_break = TRUE;
                        else if (prev_uctype == UC_HAN && uctype != UC_HAN)
                                do_break = TRUE;
                        else if (prev_uctype == UC_KATA &&
                                 uctype != UC_KATA &&
                                 utype != G_UNICODE_MODIFIER_LETTER &&
                                 utype != G_UNICODE_OTHER_SYMBOL)
                                do_break = TRUE;
                        else if (prev_uctype == UC_HIRA &&
                                 uctype != UC_HIRA &&
                                 utype != G_UNICODE_MODIFIER_LETTER &&
                                 utype != G_UNICODE_OTHER_SYMBOL)
                                do_break = TRUE;
                        else if (prev_uctype == UC_OTHER &&
                                 uctype != UC_OTHER &&
                                 prev_utype != G_UNICODE_MODIFIER_LETTER)
                                do_break = TRUE;
                }

                /* Decide whether the pending word [bp, p) is discarded. */
                if (drop_punct) {
                        is_drop = prev_is_punct;
                }
                if (drop_single_letter) {
                        if (word_len == 1 && do_break && prev_uctype != UC_HAN)
                                is_drop = TRUE;
                }
                if (drop_two_hira) {
                        if (word_len == 2 && do_break && prev2_uctype == UC_HIRA && prev_uctype == UC_HIRA)
                                is_drop = TRUE;
                }

                /* The current character becomes the "previous" one. */
                prev2_uctype = prev_uctype;
                prev_uctype = uctype;
                prev_utype = utype;
                prev_is_wide = is_wide;
                prev_is_punct = is_punct;
                prev_is_digit = is_digit;

                if (do_break) {
                        gchar *word;

                        word_len = 0;

                        if (!is_drop) {
                                if (normalize)
                                        word = g_ascii_strdown(bp, p - bp);
                                else
                                        word = g_strndup(bp, p - bp);
                                g_strstrip(word);
                                if (*word == '\0') {
                                        /* nothing left after stripping: skip it */
                                        g_free(word);
                                        bp = p;
                                        p = g_utf8_next_char(p);
                                        continue;
                                }

                                if (out_str->len > 0)
                                        g_string_append_c(out_str, ' ');
                                g_string_append(out_str, word);

                                g_free(word);
                        }

                        if (is_space) {
                                /* swallow the whole whitespace run */
                                do {
                                        p = g_utf8_next_char(p);
                                        wc = g_utf8_get_char(p);
                                } while (g_unichar_isspace(wc));
                                bp = p;
                                continue;
                        }
                        bp = p;
                } else {
                        if (is_space) {
                                /* whitespace with no pending word: just skip it */
                                word_len = 0;
                                do {
                                        p = g_utf8_next_char(p);
                                        wc = g_utf8_get_char(p);
                                } while (g_unichar_isspace(wc));
                                bp = p;
                                continue;
                        }
                }

                p = g_utf8_next_char(p);
                word_len++;
        }

        /* Flush the word that was still pending when the input ended. */
        if (p > bp) {
                gchar *word;
                gboolean is_drop = FALSE;

                if (drop_punct) {
                        is_drop = prev_is_punct;
                }

                if (!is_drop) {
                        if (normalize)
                                word = g_ascii_strdown(bp, -1);
                        else
                                word = g_strdup(bp);
                        g_strstrip(word);
                        if (*word != '\0') {
                                if (out_str->len > 0)
                                        g_string_append_c(out_str, ' ');
                                g_string_append(out_str, word);
                        }
                        g_free(word);
                }
        }

        /* Hand the character data to the caller; only the GString wrapper is freed. */
        return g_string_free(out_str, FALSE);
}
226
227
/*
 * xfilter_wordsep_func:
 * @filter: the filter being run (not used by this function).
 * @data:   message data to read the MIME type, content and headers from.
 * @result: receives the status and, on success, the rewritten message data.
 *
 * Content-filter callback of the "wordsep" filter.  For "text/..." content
 * it runs do_wordsep() over the body and over the XM_FROM, XM_TO, XM_CC and
 * XM_SUBJECT attributes (when present) and stores the word-separated text
 * in a new XMessageData attached to @result.
 *
 * Returns: XF_ERROR when @result is NULL, XF_UNSUPPORTED_TYPE when the MIME
 * type is missing or does not start with "text/", XF_REWRITTEN otherwise.
 */
static XFilterStatus xfilter_wordsep_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const char *mime_type;
        const char *content = NULL;
        char *processed_content;
        XMessageData *msgdata;

        g_return_val_if_fail(result != NULL, XF_ERROR);

        /*
         * Only text content is handled.  g_ascii_strncasecmp() is used
         * instead of the deprecated, locale-dependent g_strncasecmp().
         */
        mime_type = xfilter_message_data_get_mime_type(data);
        if (!mime_type || g_ascii_strncasecmp(mime_type, "text/", 5) != 0) {
                xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
                return XF_UNSUPPORTED_TYPE;
        }

        content = xfilter_message_data_get_content(data);
        processed_content = do_wordsep(content, TRUE, TRUE, TRUE, FALSE);

        /*
         * NOTE(review): unlike the header strings below, the body string is
         * not freed here; presumably xfilter_message_data_set_content()
         * takes ownership of it -- confirm against its implementation.
         */
        msgdata = xfilter_message_data_new(NULL, mime_type);
        xfilter_message_data_set_content(msgdata, processed_content);

/* Header trace output is only built when WORDSEP_DEBUG is defined. */
#ifdef WORDSEP_DEBUG
#define WORDSEP_TRACE_HEADER(value) g_print("header: %s\n", (value))
#else
#define WORDSEP_TRACE_HEADER(value) ((void)0)
#endif

/* Word-separate one header attribute, if present, and copy it to msgdata. */
#define WORDSEP_HEADER(attr)                                                \
        do {                                                                \
                content = xfilter_message_data_get_attribute(data, attr);   \
                if (content) {                                              \
                        WORDSEP_TRACE_HEADER(content);                      \
                        processed_content =                                 \
                                do_wordsep(content, TRUE, TRUE, TRUE, FALSE); \
                        xfilter_message_data_set_attribute                  \
                                (msgdata, attr, processed_content, FALSE);  \
                        g_free(processed_content);                          \
                }                                                           \
        } while (0)

        WORDSEP_HEADER(XM_FROM);
        WORDSEP_HEADER(XM_TO);
        WORDSEP_HEADER(XM_CC);
        WORDSEP_HEADER(XM_SUBJECT);

#undef WORDSEP_HEADER
#undef WORDSEP_TRACE_HEADER

        xfilter_result_set_message_data(result, msgdata);

        xfilter_result_set_status(result, XF_REWRITTEN);

        return XF_REWRITTEN;
}
278
279
XFilter *xfilter_wordsep_new(void)
280
{
281
        XFilter *filter;
282
283
        filter = xfilter_new(XF_CONTENT, "wordsep");
284
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter),
285
                                        xfilter_wordsep_func);
286
287
        return filter;
288
}