Statistics
| Branch: | Tag: | Revision:

root / lib / filters / bayes-filter.c @ f1db2e5b

History | View | Annotate | Download (10.9 kB)

1
#include <glib.h>
2
#include <stdio.h>
3
#include <string.h>
4
5
#include "filter.h"
6
#include "filter-kvs.h"
7
#include "bayes-filter.h"
8
9
#define N_TOKENS 15
10
11
static XFilterKVS *junk_kvs;
12
static XFilterKVS *clean_kvs;
13
static XFilterKVS *prob_kvs;
14
15
16
/* Test */
17
18
typedef struct _XFilterBayesProbData
19
{
20
        GArray *array;
21
        XFilterBayesLearnStatus status;
22
} XFilterBayesProbData;
23
24
typedef struct _XFilterKeyCount
25
{
26
        const char *key;
27
        int count;
28
        double prob;
29
} XFilterKeyCount;
30
31
static void xfilter_bayes_content_word_freq(GHashTable *table, const char *prefix, const char *text)
32
{
33
        const char *bp = text, *p = text;
34
        char *word;
35
        int count;
36
37
        if (!text)
38
                return;
39
40
        while (*p != '\0') {
41
                while (*p == ' ')
42
                        p++;
43
                bp = p;
44
                while (*p != '\0' && *p != ' ')
45
                        p++;
46
                if (p > bp) {
47
                        word = g_strndup(bp, p - bp);
48
                        if (prefix) {
49
                                char *bword = word;
50
                                word = g_strconcat(prefix, "*", bword, NULL);
51
                                g_free(bword);
52
                        }
53
                        count = GPOINTER_TO_INT(g_hash_table_lookup(table, word));
54
                        count++;
55
                        g_hash_table_insert(table, word, GINT_TO_POINTER(count));
56
                }
57
        }
58
}
59
60
static GHashTable *xfilter_bayes_word_freq(const XMessageData *data)
61
{
62
        GHashTable *table;
63
        const char *content;
64
65
        table = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
66
67
        content = xfilter_message_data_get_attribute(data, XM_FROM);
68
        xfilter_bayes_content_word_freq(table, "From", content);
69
        content = xfilter_message_data_get_attribute(data, XM_TO);
70
        xfilter_bayes_content_word_freq(table, "To", content);
71
        content = xfilter_message_data_get_attribute(data, XM_CC);
72
        xfilter_bayes_content_word_freq(table, "Cc", content);
73
        content = xfilter_message_data_get_attribute(data, XM_SUBJECT);
74
        xfilter_bayes_content_word_freq(table, "Subject", content);
75
76
        content = xfilter_message_data_get_content(data);
77
        xfilter_bayes_content_word_freq(table, NULL, content);
78
79
        return table;
80
}
81
82
static char *get_degenerated_word(const char *word)
83
{
84
        const char *p;
85
86
        if (!word)
87
                return NULL;
88
89
        if ((p = strchr(word, '*'))) {
90
                return g_strdup(p + 1);
91
        }
92
        if ((p = strchr(word, '!'))) {
93
                if (*(p + 1) == '!')
94
                        return g_strndup(word, p + 1 - word);
95
                else
96
                        return g_strndup(word, p - word);
97
        }
98
99
        for (p = word; *p != '\0'; p++) {
100
                if (g_ascii_isupper(*p))
101
                        return g_ascii_strdown(word, -1);
102
        }
103
104
        return NULL;
105
}
106
107
static double xfilter_get_prob(const char *key, XFilterBayesLearnStatus *status, gboolean do_degeneration)
108
{
109
        int n_junk;
110
        int n_clean;
111
        int n_junk_learn;
112
        int n_clean_learn;
113
        double prob = -1.0;
114
115
        n_junk = xfilter_kvs_fetch_int(junk_kvs, key);
116
        n_clean = xfilter_kvs_fetch_int(clean_kvs, key) * 2;
117
118
        if (n_junk + n_clean < 5) {
119
                if (n_junk > 0 && n_clean == 0) {
120
                        g_print("* %s: (%4f) (j: %d c: %d)\n", (gchar *)key, 0.6, n_junk, n_clean);
121
                        return 0.6;
122
                }
123
124
                if (do_degeneration) {
125
                        char *deg_key;
126
127
                        deg_key = get_degenerated_word(key);
128
                        if (deg_key) {
129
                                g_print("[degen] %s -> %s\n", key, deg_key);
130
                                prob = xfilter_get_prob(deg_key, status, TRUE);
131
                                g_free(deg_key);
132
                        }
133
                }
134
135
                return prob;
136
        } 
137
138
        n_junk_learn = status->junk_learned_num;
139
        if (n_junk_learn < 1)
140
                n_junk_learn = 1;
141
        n_clean_learn = status->nojunk_learned_num;
142
        if (n_clean_learn < 1)
143
                n_clean_learn = 1;
144
145
        prob = ((double)n_junk / n_junk_learn) /
146
                (((double)n_clean / n_clean_learn) + ((double)n_junk / n_junk_learn));
147
        if (prob < 0.01)
148
                prob = 0.01;
149
        else if (prob > 0.99) {
150
                if (n_clean == 0) {
151
                        if (n_junk > 10)
152
                                prob = 0.999;
153
                        else
154
                                prob = 0.998;
155
                } else
156
                        prob = 0.99;
157
        }
158
159
        g_print("%s: %4f (j: %d c: %d)\n", (gchar *)key, prob, n_junk, n_clean);
160
161
        return prob;
162
}
163
164
static void test_walk_func(gpointer key, gpointer val, gpointer data)
165
{
166
        XFilterBayesProbData *pdata;
167
        XFilterKeyCount kc;
168
169
        pdata = (XFilterBayesProbData *)data;
170
        kc.key = (gchar *)key;
171
        kc.count = GPOINTER_TO_INT(val);
172
        kc.prob = xfilter_get_prob(kc.key, &pdata->status, TRUE);
173
        //if (kc.prob > 0)
174
                //g_print("%s: (this: %d) %4f\n", kc.key, kc.count, kc.prob);
175
        if (kc.prob < 0)
176
                kc.prob = 0.4;
177
        g_array_append_val(pdata->array, kc);
178
}
179
180
static gint key_prob_compare_func(gconstpointer a, gconstpointer b)
181
{
182
        const XFilterKeyCount *kc1 = a;
183
        const XFilterKeyCount *kc2 = b;
184
        double da, db;
185
186
        da = ABS(0.5 - kc1->prob);
187
        db = ABS(0.5 - kc2->prob);
188
        return db * 10000 - da * 10000;
189
}
190
191
static XFilterStatus xfilter_bayes_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
192
{
193
        const char *type;
194
        GHashTable *table;
195
        XFilterBayesProbData pdata;
196
        int i;
197
        double prod = 1.0, prod_rev = 1.0;
198
        double cmb_prob;
199
        XFilterStatus status;
200
201
        g_return_val_if_fail(result != NULL, XF_ERROR);
202
203
        type = xfilter_message_data_get_mime_type(data);
204
        if (!type || g_strncasecmp(type, "text/", 5) != 0) {
205
                xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
206
                return XF_UNSUPPORTED_TYPE;
207
        }
208
209
        if (!junk_kvs) {
210
                g_warning("Cannot open junk database");
211
                xfilter_result_set_status(result, XF_ERROR);
212
                return XF_ERROR;
213
        }
214
215
        g_print("bayes-guessing message\n");
216
217
        table = xfilter_bayes_word_freq(data);
218
        pdata.array = g_array_sized_new(FALSE, FALSE, sizeof(XFilterKeyCount), 128);
219
        xfilter_bayes_get_learn_status(&pdata.status);
220
        g_print("\ncalculating probability for each tokens:\n");
221
        g_hash_table_foreach(table, test_walk_func, &pdata);
222
        g_array_sort(pdata.array, key_prob_compare_func);
223
224
        g_print("\nmost interesting tokens:\n");
225
        for (i = 0; i < 15 && i < pdata.array->len; i++) {
226
                XFilterKeyCount kc = g_array_index(pdata.array, XFilterKeyCount, i);
227
                prod *= kc.prob;
228
                prod_rev *= 1 - kc.prob;
229
                g_print("%s: %d %4f\n", kc.key, kc.count, kc.prob);
230
        }
231
232
        cmb_prob = prod / (prod + prod_rev);
233
        g_print("\ncombined probability: %4f\n", cmb_prob);
234
235
        g_array_free(pdata.array, TRUE);
236
        g_hash_table_destroy(table);
237
238
        xfilter_result_set_probability(result, cmb_prob);
239
        if (cmb_prob > 0.90)
240
                status = XF_JUNK;
241
        else if (cmb_prob < 0.10)
242
                status = XF_NOJUNK;
243
        else
244
                status = XF_UNCERTAIN;
245
        xfilter_result_set_status(result, status);
246
        
247
        return status;
248
}
249
250
XFilter *xfilter_bayes_new(void)
251
{
252
        XFilter *filter;
253
254
        filter = xfilter_new(XF_TEST, "bayes-test");
255
        xfilter_set_test_filter_func(X_TEST_FILTER(filter), xfilter_bayes_func);
256
257
        return filter;
258
}
259
260
261
/* Learning */
262
263
static void learn_walk_func(gpointer key, gpointer val, gpointer data)
264
{
265
        XFilterKVS *kvs = (XFilterKVS *)data;
266
267
        //g_print("%s: %d (%s)\n", (gchar *)key, GPOINTER_TO_INT(val), kvs == junk_kvs ? "j" : "c");
268
        if (xfilter_kvs_increment(kvs, (gchar *)key, GPOINTER_TO_INT(val)) < 0)
269
                g_warning("database update error");
270
}
271
272
static void learn_update_prob_func(gpointer key, gpointer val, gpointer data)
273
{
274
        XFilterBayesLearnStatus *s = (XFilterBayesLearnStatus *)data;
275
        double prob;
276
        char prob_str[7] = "";
277
278
        prob = xfilter_get_prob((gchar *)key, s, FALSE);
279
        if (prob < 0)
280
                return;
281
282
        g_snprintf(prob_str, sizeof(prob_str), "%4f", prob);
283
        if (xfilter_kvs_update(prob_kvs, (gchar *)key, prob_str, 6) < 0)
284
                g_warning("prob database update error");
285
}
286
287
static XFilterStatus xfilter_bayes_learn(XFilter *filter, const XMessageData *data, XFilterResult *result, gboolean is_junk)
288
{
289
        const char *type;
290
        GHashTable *table;
291
        XFilterKVS *kvs;
292
        XFilterBayesLearnStatus status = {0};
293
294
        g_return_val_if_fail(result != NULL, XF_ERROR);
295
296
        type = xfilter_message_data_get_mime_type(data);
297
        if (!type || g_strncasecmp(type, "text/", 5) != 0) {
298
                xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
299
                return XF_UNSUPPORTED_TYPE;
300
        }
301
302
        if (is_junk)
303
                kvs = junk_kvs;
304
        else
305
                kvs = clean_kvs;
306
        if (!kvs) {
307
                g_warning("xfilter_bayes_learn: Cannot open database");
308
                xfilter_result_set_status(result, XF_ERROR);
309
                return XF_ERROR;
310
        }
311
312
        g_print("learning message\n");
313
        table = xfilter_bayes_word_freq(data);
314
        g_hash_table_foreach(table, learn_walk_func, kvs);
315
        if (is_junk)
316
                xfilter_kvs_increment(prob_kvs, "@junk_learn_count", 1);
317
        else
318
                xfilter_kvs_increment(prob_kvs, "@clean_learn_count", 1);
319
        xfilter_bayes_get_learn_status(&status);
320
        g_hash_table_foreach(table, learn_update_prob_func, &status);
321
        g_hash_table_destroy(table);
322
323
        xfilter_result_set_status(result, XF_NONE);
324
325
        return XF_NONE;
326
}
327
328
static XFilterStatus xfilter_bayes_learn_junk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
329
{
330
        return xfilter_bayes_learn(filter, data, result, TRUE);
331
}
332
333
static XFilterStatus xfilter_bayes_learn_nojunk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
334
{
335
        return xfilter_bayes_learn(filter, data, result, FALSE);
336
}
337
338
XFilter *xfilter_bayes_learn_junk_new(void)
339
{
340
        XFilter *filter;
341
342
        filter = xfilter_new(XF_CONTENT, "bayes-learn-junk");
343
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_learn_junk_func);
344
345
        return filter;
346
}
347
348
XFilter *xfilter_bayes_learn_nojunk_new(void)
349
{
350
        XFilter *filter;
351
352
        filter = xfilter_new(XF_CONTENT, "bayes-learn-clean");
353
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_learn_nojunk_func);
354
355
        return filter;
356
}
357
358
359
int xfilter_bayes_get_learn_status(XFilterBayesLearnStatus *status)
360
{
361
        g_return_val_if_fail(status != NULL, -1);
362
363
        status->junk_words = xfilter_kvs_count_sum(junk_kvs);
364
        status->nojunk_words = xfilter_kvs_count_sum(clean_kvs);
365
        status->junk_learned_num = xfilter_kvs_fetch_int(prob_kvs, "@junk_learn_count");
366
        status->nojunk_learned_num = xfilter_kvs_fetch_int(prob_kvs, "@clean_learn_count");
367
368
        return 0;
369
}
370
371
int xfilter_bayes_reset_learn_count(void)
372
{
373
        return 0;
374
}
375
376
int xfilter_bayes_reset_all(void)
377
{
378
        return 0;
379
}
380
381
static int show_walk_func(XFilterKVS *kvs, const char *key, void *value, int size, void *data)
382
{
383
        int ival;
384
385
        if (size == 4) {
386
                ival = *(int *)value;
387
                printf("%s: %d\n", key, ival);
388
        }
389
390
        return 0;
391
}
392
393
int xfilter_bayes_db_show_contents(void)
394
{
395
        XFilterBayesLearnStatus status = {0};
396
397
        if (!junk_kvs || !clean_kvs || !prob_kvs) {
398
                g_warning("Database not ready");
399
                return -1;
400
        }
401
402
        printf("Junk tokens:\n");
403
        xfilter_kvs_foreach(junk_kvs, show_walk_func, NULL);
404
        printf("\nClean tokens:\n");
405
        xfilter_kvs_foreach(clean_kvs, show_walk_func, NULL);
406
407
        printf("\nStatus:\n");
408
        xfilter_bayes_get_learn_status(&status);
409
        printf("junk_words: %d\n", status.junk_words);
410
        printf("nojunk_words: %d\n", status.nojunk_words);
411
        printf("junk_learned_num: %d\n", status.junk_learned_num);
412
        printf("nojunk_learned_num: %d\n", status.nojunk_learned_num);
413
414
        return 0;
415
}
416
417
int xfilter_bayes_db_init(void)
418
{
419
        g_print("xfilter_bayes_db_init: init database\n");
420
421
        if (!junk_kvs) {
422
                junk_kvs = xfilter_kvs_open("junk.db");
423
                if (!junk_kvs) {
424
                        g_warning("Cannot open database: junk.db");
425
                        return -1;
426
                }
427
        }
428
        if (!clean_kvs) {
429
                clean_kvs = xfilter_kvs_open("clean.db");
430
                if (!clean_kvs) {
431
                        g_warning("Cannot open database: clean.db");
432
                        xfilter_kvs_close(junk_kvs);
433
                        return -1;
434
                }
435
        }
436
        if (!prob_kvs) {
437
                prob_kvs = xfilter_kvs_open("prob.db");
438
                if (!prob_kvs) {
439
                        g_warning("Cannot open database: prob.db");
440
                        xfilter_kvs_close(clean_kvs);
441
                        xfilter_kvs_close(junk_kvs);
442
                        return -1;
443
                }
444
        }
445
446
        return 0;
447
}
448
449
int xfilter_bayes_db_done(void)
450
{
451
        int ret = 0;
452
453
        g_print("xfilter_bayes_db_init: close database\n");
454
455
        if (prob_kvs)
456
                ret |= xfilter_kvs_close(prob_kvs);
457
        if (clean_kvs)
458
                ret |= xfilter_kvs_close(clean_kvs);
459
        if (junk_kvs)
460
                ret |= xfilter_kvs_close(junk_kvs);
461
462
        return ret;
463
}