Statistics
| Branch: | Tag: | Revision:

root / lib / filters / bayes-filter.c @ aebfd4cc

History | View | Annotate | Download (13.6 KB)

1
/* SylFilter - a message filter
2
 *
3
 * Copyright (C) 2011 Hiroyuki Yamamoto
4
 * Copyright (C) 2011 Sylpheed Development Team
5
 */
6

    
7
#include <glib.h>
8
#include <stdio.h>
9
#include <string.h>
10

    
11
#include "filter.h"
12
#include "filter-kvs.h"
13
#include "filter-utils.h"
14
#include "bayes-filter.h"
15

    
16
#define N_TOKENS 15
17

    
18
static XFilterKVS *junk_kvs;
19
static XFilterKVS *clean_kvs;
20
static XFilterKVS *prob_kvs;
21

    
22

    
23
/* Test */
24

    
25
typedef struct _XFilterBayesProbData
26
{
27
        GArray *array;
28
        XFilterBayesLearnStatus status;
29
} XFilterBayesProbData;
30

    
31
/* A message token together with how often it occurred and the junk
 * probability assigned to it. */
typedef struct _XFilterKeyCount XFilterKeyCount;

struct _XFilterKeyCount
{
        const char *key;        /* token text; borrowed, not owned by this struct */
        int count;              /* occurrences of the token in the message */
        double prob;            /* probability assigned to the token */
};
37

    
38
/*
 * Split `text` at space characters and bump the occurrence count kept in
 * `table` for every word found.
 *
 * table:  maps token string -> count (stored with GINT_TO_POINTER).  Each
 *         key passed to g_hash_table_insert() is a freshly allocated string.
 * prefix: when non-NULL the token is recorded as "<prefix>*<word>".
 * text:   string to tokenize; NULL is silently ignored.
 */
static void xfilter_bayes_content_word_freq(GHashTable *table, const char *prefix, const char *text)
{
        const char *cur;

        if (text == NULL)
                return;

        cur = text;
        for (;;) {
                const char *start;
                char *token;
                int seen;

                /* step over the blanks in front of the next word */
                while (*cur == ' ')
                        cur++;
                if (*cur == '\0')
                        break;

                /* find the end of the word */
                start = cur;
                while (*cur != ' ' && *cur != '\0')
                        cur++;

                token = g_strndup(start, cur - start);
                if (prefix != NULL) {
                        char *tagged = g_strconcat(prefix, "*", token, NULL);

                        g_free(token);
                        token = tagged;
                }

                /* a missing entry looks up as NULL, i.e. a count of 0 */
                seen = GPOINTER_TO_INT(g_hash_table_lookup(table, token));
                g_hash_table_insert(table, token, GINT_TO_POINTER(seen + 1));
        }
}
66

    
67
static GHashTable *xfilter_bayes_word_freq(const XMessageData *data)
68
{
69
        GHashTable *table;
70
        const char *content;
71

    
72
        table = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
73

    
74
        content = xfilter_message_data_get_attribute(data, XM_FROM);
75
        xfilter_bayes_content_word_freq(table, "From", content);
76
        content = xfilter_message_data_get_attribute(data, XM_TO);
77
        xfilter_bayes_content_word_freq(table, "To", content);
78
        content = xfilter_message_data_get_attribute(data, XM_CC);
79
        xfilter_bayes_content_word_freq(table, "Cc", content);
80
        content = xfilter_message_data_get_attribute(data, XM_SUBJECT);
81
        xfilter_bayes_content_word_freq(table, "Subject", content);
82

    
83
        content = xfilter_message_data_get_content(data);
84
        xfilter_bayes_content_word_freq(table, NULL, content);
85

    
86
        return table;
87
}
88

    
89
static char *get_degenerated_word(const char *word)
90
{
91
        const char *p;
92

    
93
        if (!word)
94
                return NULL;
95

    
96
        if ((p = strchr(word, '*'))) {
97
                return g_strdup(p + 1);
98
        }
99
        if ((p = strchr(word, '!'))) {
100
                if (*(p + 1) == '!')
101
                        return g_strndup(word, p + 1 - word);
102
                else
103
                        return g_strndup(word, p - word);
104
        }
105

    
106
        for (p = word; *p != '\0'; p++) {
107
                if (g_ascii_isupper(*p))
108
                        return g_ascii_strdown(word, -1);
109
        }
110

    
111
        return NULL;
112
}
113

    
114
/*
 * Compute the junk probability of a single token.
 *
 * key:             token to look up in junk_kvs and clean_kvs.
 * status:          learn counters; the junk_words / nojunk_words totals
 *                  are the denominators of the ratios.
 * do_degeneration: when TRUE and the token is unknown to both stores,
 *                  retry with get_degenerated_word(key).
 *
 * The clean count is doubled before use (presumably to bias against
 * false positives -- TODO confirm).  The result is kept inside
 * [lower, upper]; the bounds are 0.001/0.999 and are narrowed when the
 * token was seen fewer than five times in total.
 *
 * Returns the probability, or -1.0 when either word total is below 1 or
 * when the token (and every degenerated form tried) is unknown.
 */
static double xfilter_get_prob(const char *key, XFilterBayesLearnStatus *status, gboolean do_degeneration)
{
        /* clamp bounds for tokens seen 1, 2, 3 or 4 times */
        static const struct {
                double upper;
                double lower;
        } few_hits[] = {
                { 0.6, 0.4 },
                { 0.7, 0.3 },
                { 0.8, 0.2 },
                { 0.9, 0.1 },
        };
        const int junk_total = status->junk_words;
        const int clean_total = status->nojunk_words;
        double upper = 0.999;
        double lower = 0.001;
        double junk_ratio, clean_ratio;
        double prob;
        int n_junk, n_clean, hits;

        if (junk_total < 1 || clean_total < 1)
                return -1.0;

        n_junk = xfilter_kvs_fetch_int(junk_kvs, key);
        n_clean = xfilter_kvs_fetch_int(clean_kvs, key) * 2;
        hits = n_junk + n_clean;

        if (hits == 0) {
                char *fallback;

                /* unknown token: optionally retry with a degenerated form */
                if (!do_degeneration)
                        return -1.0;
                fallback = get_degenerated_word(key);
                if (fallback == NULL)
                        return -1.0;

                xfilter_debug_print("[degen] %s -> %s\n", key, fallback);
                prob = xfilter_get_prob(fallback, status, TRUE);
                g_free(fallback);
                return prob;
        }

        if (hits >= 1 && hits <= 4) {
                upper = few_hits[hits - 1].upper;
                lower = few_hits[hits - 1].lower;
        }

        junk_ratio = (double)n_junk / junk_total;
        clean_ratio = (double)n_clean / clean_total;
        prob = junk_ratio / (clean_ratio + junk_ratio);

        if (prob < lower) {
                if (n_junk != 0)
                        prob = lower + 0.002;
                else if (n_clean > 10)
                        prob = lower;
                else
                        prob = lower + 0.001;
        } else if (prob > upper) {
                if (n_clean != 0)
                        prob = upper - 0.002;
                else if (n_junk > 10)
                        prob = upper;
                else
                        prob = upper - 0.001;
        }

        xfilter_debug_print("%s: %4f (j: %d c: %d)\n", key, prob, n_junk, n_clean);

        return prob;
}
188

    
189
/*
 * GHFunc used while scoring a message: computes the probability of one
 * token and appends an XFilterKeyCount for it to the array in `data`.
 *
 * key:  token string (hash table key).
 * val:  occurrence count packed with GINT_TO_POINTER.
 * data: XFilterBayesProbData accumulator.
 *
 * A negative result from xfilter_get_prob() is replaced by 0.4.
 */
static void test_walk_func(gpointer key, gpointer val, gpointer data)
{
        XFilterBayesProbData *acc = data;
        XFilterKeyCount entry;
        double p;

        p = xfilter_get_prob(key, &acc->status, TRUE);

        entry.key = key;
        entry.count = GPOINTER_TO_INT(val);
        entry.prob = (p < 0) ? 0.4 : p;

        g_array_append_val(acc->array, entry);
}
204

    
205
/*
 * GCompareFunc ordering XFilterKeyCount elements so that the entry whose
 * probability lies farthest from 0.5 sorts first.
 *
 * The distances are compared directly rather than returning a scaled
 * difference converted to gint: that conversion truncates, so distances
 * closer than 1e-4 compared as equal and the ordering was not transitive.
 *
 * Returns a negative value when `a` sorts before `b`, a positive value
 * when it sorts after, and 0 when both distances are equal.
 */
static gint key_prob_compare_func(gconstpointer a, gconstpointer b)
{
        const XFilterKeyCount *kc1 = a;
        const XFilterKeyCount *kc2 = b;
        const double da = ABS(0.5 - kc1->prob);
        const double db = ABS(0.5 - kc2->prob);

        if (db > da)
                return 1;
        if (db < da)
                return -1;
        return 0;
}
215

    
216
/*
 * Test-filter callback: estimate whether a message is junk.
 *
 * filter: unused.
 * data:   message to classify; only "text/..." MIME types are handled.
 * result: receives the combined probability and the resulting status;
 *         must not be NULL.
 *
 * The per-token probabilities are sorted so that the ones farthest from
 * 0.5 come first, and at most N_TOKENS of them are combined as
 * prod(p) / (prod(p) + prod(1 - p)).  If no junk message or no clean
 * message has been learned yet, the probability is fixed at 0.5.
 *
 * Returns XF_JUNK above 0.90, XF_NOJUNK below 0.10, XF_UNCERTAIN in
 * between, XF_UNSUPPORTED_TYPE for non-text messages and XF_ERROR when
 * the databases are not available.
 */
static XFilterStatus xfilter_bayes_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const char *type;
        XFilterBayesProbData pdata;
        double cmb_prob = 0.5;
        XFilterStatus status;

        g_return_val_if_fail(result != NULL, XF_ERROR);

        type = xfilter_message_data_get_mime_type(data);
        if (!type || g_ascii_strncasecmp(type, "text/", 5) != 0) {
                xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
                return XF_UNSUPPORTED_TYPE;
        }

        if (!junk_kvs) {
                g_warning("Cannot open junk database");
                xfilter_result_set_status(result, XF_ERROR);
                return XF_ERROR;
        }
        /* the clean and learn-count stores are read below as well */
        if (!clean_kvs || !prob_kvs) {
                g_warning("xfilter_bayes_func: Database not ready");
                xfilter_result_set_status(result, XF_ERROR);
                return XF_ERROR;
        }

        xfilter_debug_print("bayes-guessing message\n");

        if (xfilter_bayes_get_learn_status(&pdata.status) < 0) {
                xfilter_result_set_status(result, XF_ERROR);
                return XF_ERROR;
        }

        if (pdata.status.junk_learned_num < 1) {
                xfilter_debug_print("junk message not learned yet\n");
        } else if (pdata.status.nojunk_learned_num < 1) {
                xfilter_debug_print("clean message not learned yet\n");
        } else {
                GHashTable *table;
                double prod = 1.0, prod_rev = 1.0;
                guint i;

                xfilter_debug_print("\ncalculating probability for each tokens:\n");
                table = xfilter_bayes_word_freq(data);
                pdata.array = g_array_sized_new(FALSE, FALSE, sizeof(XFilterKeyCount), 128);

                g_hash_table_foreach(table, test_walk_func, &pdata);
                g_array_sort(pdata.array, key_prob_compare_func);

                xfilter_debug_print("\nmost interesting tokens:\n");
                for (i = 0; i < N_TOKENS && i < pdata.array->len; i++) {
                        const XFilterKeyCount *kc = &g_array_index(pdata.array, XFilterKeyCount, i);

                        prod *= kc->prob;
                        prod_rev *= 1 - kc->prob;
                        xfilter_debug_print("%s: %d %4f\n", kc->key, kc->count, kc->prob);
                }

                /* with no tokens both products stay 1.0, giving 0.5 */
                cmb_prob = prod / (prod + prod_rev);
                xfilter_debug_print("\ncombined probability: %4f\n", cmb_prob);

                /* the array entries borrow their keys from the table,
                 * so the array is released first */
                g_array_free(pdata.array, TRUE);
                g_hash_table_destroy(table);
        }

        xfilter_result_set_probability(result, cmb_prob);
        if (cmb_prob > 0.90)
                status = XF_JUNK;
        else if (cmb_prob < 0.10)
                status = XF_NOJUNK;
        else
                status = XF_UNCERTAIN;
        xfilter_result_set_status(result, status);

        return status;
}
284

    
285
XFilter *xfilter_bayes_new(void)
286
{
287
        XFilter *filter;
288

    
289
        filter = xfilter_new(XF_TEST, "bayes-test");
290
        xfilter_set_test_filter_func(X_TEST_FILTER(filter), xfilter_bayes_func);
291

    
292
        return filter;
293
}
294

    
295

    
296
/* Learning */
297

    
298
/*
 * GHFunc used while learning: adds a token's occurrence count to the
 * store passed in `data`.
 *
 * key:  token string.
 * val:  occurrence count packed with GINT_TO_POINTER.
 * data: the XFilterKVS to update.
 *
 * A failed update is only reported with g_warning().
 */
static void learn_walk_func(gpointer key, gpointer val, gpointer data)
{
        XFilterKVS *db = data;
        gchar *token = key;
        const int amount = GPOINTER_TO_INT(val);

        if (xfilter_kvs_increment(db, token, amount) < 0)
                g_warning("database update error");
}
306

    
307
/*
 * GHFunc used while unlearning: subtracts a token's occurrence count
 * from the store passed in `data`.
 *
 * key:  token string.
 * val:  occurrence count packed with GINT_TO_POINTER.
 * data: the XFilterKVS to update.
 *
 * A failed update is only reported with g_warning().
 */
static void unlearn_walk_func(gpointer key, gpointer val, gpointer data)
{
        XFilterKVS *db = data;
        gchar *token = key;
        const int amount = GPOINTER_TO_INT(val);

        if (xfilter_kvs_decrement(db, token, amount) < 0)
                g_warning("database update error");
}
315

    
316
/*
 * Shared implementation of the four learn/unlearn content filters.
 *
 * filter:      unused.
 * data:        message to (un)learn; only "text/..." MIME types are handled.
 * result:      receives the status; must not be NULL.
 * is_junk:     TRUE to update the junk store, FALSE for the clean store.
 * is_register: TRUE to add the message's token counts, FALSE to subtract.
 *
 * Besides the per-token counts, the matching "@junk_learn_count" or
 * "@clean_learn_count" entry in prob_kvs is incremented or decremented
 * by one.  Both the token store and prob_kvs must be open before any
 * count is touched, so the two cannot drift apart because one is missing.
 *
 * Returns XF_NONE on completion, XF_UNSUPPORTED_TYPE for non-text
 * messages and XF_ERROR when a required database is not open.  Failed
 * database updates are reported with g_warning() only.
 */
static XFilterStatus xfilter_bayes_learn(XFilter *filter, const XMessageData *data, XFilterResult *result, gboolean is_junk, gboolean is_register)
{
        const char *type;
        GHashTable *table;
        XFilterKVS *kvs;
        gboolean count_failed;

        g_return_val_if_fail(result != NULL, XF_ERROR);

        type = xfilter_message_data_get_mime_type(data);
        if (!type || g_ascii_strncasecmp(type, "text/", 5) != 0) {
                xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
                return XF_UNSUPPORTED_TYPE;
        }

        kvs = is_junk ? junk_kvs : clean_kvs;
        if (!kvs || !prob_kvs) {
                g_warning("xfilter_bayes_learn: Cannot open database");
                xfilter_result_set_status(result, XF_ERROR);
                return XF_ERROR;
        }

        xfilter_debug_print("%slearning %s message\n", is_register ? "" : "un", is_junk ? "junk" : "clean");

        table = xfilter_bayes_word_freq(data);
        if (is_register) {
                g_hash_table_foreach(table, learn_walk_func, kvs);
                count_failed = xfilter_kvs_increment(prob_kvs,
                                is_junk ? "@junk_learn_count" : "@clean_learn_count", 1) < 0;
        } else {
                g_hash_table_foreach(table, unlearn_walk_func, kvs);
                count_failed = xfilter_kvs_decrement(prob_kvs,
                                is_junk ? "@junk_learn_count" : "@clean_learn_count", 1) < 0;
        }
        g_hash_table_destroy(table);

        /* same best-effort policy as the per-token updates: warn only */
        if (count_failed)
                g_warning("xfilter_bayes_learn: learn count update error");

        xfilter_result_set_status(result, XF_NONE);

        return XF_NONE;
}
362

    
363
/* Content-filter callback: learn `data` as a junk message. */
static XFilterStatus xfilter_bayes_learn_junk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const gboolean is_junk = TRUE;
        const gboolean is_register = TRUE;

        return xfilter_bayes_learn(filter, data, result, is_junk, is_register);
}
367

    
368
/* Content-filter callback: learn `data` as a clean (non-junk) message. */
static XFilterStatus xfilter_bayes_learn_nojunk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const gboolean is_junk = FALSE;
        const gboolean is_register = TRUE;

        return xfilter_bayes_learn(filter, data, result, is_junk, is_register);
}
372

    
373
/* Content-filter callback: undo learning `data` as a junk message. */
static XFilterStatus xfilter_bayes_unlearn_junk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const gboolean is_junk = TRUE;
        const gboolean is_register = FALSE;

        return xfilter_bayes_learn(filter, data, result, is_junk, is_register);
}
377

    
378
/* Content-filter callback: undo learning `data` as a clean message. */
static XFilterStatus xfilter_bayes_unlearn_nojunk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const gboolean is_junk = FALSE;
        const gboolean is_register = FALSE;

        return xfilter_bayes_learn(filter, data, result, is_junk, is_register);
}
382

    
383
XFilter *xfilter_bayes_learn_junk_new(void)
384
{
385
        XFilter *filter;
386

    
387
        filter = xfilter_new(XF_CONTENT, "bayes-learn-junk");
388
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_learn_junk_func);
389

    
390
        return filter;
391
}
392

    
393
XFilter *xfilter_bayes_learn_nojunk_new(void)
394
{
395
        XFilter *filter;
396

    
397
        filter = xfilter_new(XF_CONTENT, "bayes-learn-clean");
398
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_learn_nojunk_func);
399

    
400
        return filter;
401
}
402

    
403
XFilter *xfilter_bayes_unlearn_junk_new(void)
404
{
405
        XFilter *filter;
406

    
407
        filter = xfilter_new(XF_CONTENT, "bayes-unlearn-junk");
408
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_unlearn_junk_func);
409

    
410
        return filter;
411
}
412

    
413
XFilter *xfilter_bayes_unlearn_nojunk_new(void)
414
{
415
        XFilter *filter;
416

    
417
        filter = xfilter_new(XF_CONTENT, "bayes-unlearn-clean");
418
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_unlearn_nojunk_func);
419

    
420
        return filter;
421
}
422

    
423

    
424
/*
 * Fill `status` with the current learning counters.
 *
 * junk_words / nojunk_words come from xfilter_kvs_count_sum() on the junk
 * and clean stores; junk_learned_num / nojunk_learned_num are read from
 * the "@junk_learn_count" / "@clean_learn_count" entries of prob_kvs.
 *
 * Returns 0 on success.  Returns -1 when `status` is NULL, or when the
 * databases have not been opened; in the latter case the four counters
 * are set to 0 so that a caller ignoring the return value sees
 * "nothing learned yet" instead of indeterminate data.
 */
int xfilter_bayes_get_learn_status(XFilterBayesLearnStatus *status)
{
        g_return_val_if_fail(status != NULL, -1);

        /* NOTE(review): whether the xfilter_kvs_* calls tolerate a NULL
         * handle is not visible from here, so do not rely on it. */
        if (!junk_kvs || !clean_kvs || !prob_kvs) {
                status->junk_words = 0;
                status->nojunk_words = 0;
                status->junk_learned_num = 0;
                status->nojunk_learned_num = 0;
                return -1;
        }

        status->junk_words = xfilter_kvs_count_sum(junk_kvs);
        status->nojunk_words = xfilter_kvs_count_sum(clean_kvs);
        status->junk_learned_num = xfilter_kvs_fetch_int(prob_kvs, "@junk_learn_count");
        status->nojunk_learned_num = xfilter_kvs_fetch_int(prob_kvs, "@clean_learn_count");

        return 0;
}
435

    
436
/*
 * Placeholder: no counter is modified here.
 *
 * Returns 0 unconditionally.
 */
int xfilter_bayes_reset_learn_count(void)
{
        /* nothing is reset yet */
        return 0;
}
440

    
441
/*
 * Placeholder: no database content is modified here.
 *
 * Returns 0 unconditionally.
 */
int xfilter_bayes_reset_all(void)
{
        /* nothing is reset yet */
        return 0;
}
445

    
446
/*
 * xfilter_kvs_foreach() callback that prints one "key: value" line to
 * stdout for every entry whose value has the size of an int.
 *
 * kvs:   store being walked (unused).
 * key:   entry key.
 * value: raw value bytes.
 * size:  number of bytes in `value`.
 * data:  user data (unused).
 *
 * Entries of any other size are skipped.  The value is copied out with
 * memcpy() because nothing visible here guarantees that the raw buffer
 * is suitably aligned for an int.
 *
 * Always returns 0.
 */
static int show_walk_func(XFilterKVS *kvs, const char *key, void *value, int size, void *data)
{
        int ival;

        if (value != NULL && size == (int)sizeof(ival)) {
                memcpy(&ival, value, sizeof(ival));
                printf("%s: %d\n", key, ival);
        }

        return 0;
}
457

    
458
int xfilter_bayes_db_show_contents(void)
459
{
460
        XFilterBayesLearnStatus status = {0};
461

    
462
        if (!junk_kvs || !clean_kvs || !prob_kvs) {
463
                g_warning("Database not ready");
464
                return -1;
465
        }
466

    
467
        printf("Junk tokens:\n");
468
        xfilter_kvs_foreach(junk_kvs, show_walk_func, NULL);
469
        printf("\nClean tokens:\n");
470
        xfilter_kvs_foreach(clean_kvs, show_walk_func, NULL);
471

    
472
        printf("\nStatus:\n");
473
        xfilter_bayes_get_learn_status(&status);
474
        printf("junk_words: %d\n", status.junk_words);
475
        printf("nojunk_words: %d\n", status.nojunk_words);
476
        printf("junk_learned_num: %d\n", status.junk_learned_num);
477
        printf("nojunk_learned_num: %d\n", status.nojunk_learned_num);
478

    
479
        return 0;
480
}
481

    
482
int xfilter_bayes_db_init(const char *path)
483
{
484
        char *file;
485

    
486
        xfilter_debug_print("xfilter_bayes_db_init: init database\n");
487
        xfilter_debug_print("xfilter_bayes_db_init: path: %s\n",
488
                            path ? path : "(current dir)");
489

    
490
        if (path) {
491
                xfilter_debug_print("xfilter_bayes_db_init: making directory: %s\n", path);
492
                if (xfilter_utils_mkdir(path) < 0) {
493
                        g_warning("Making directory failed: %s", path);
494
                        return -1;
495
                }
496
        }
497

    
498
        if (!junk_kvs) {
499
                if (path)
500
                        file = g_strconcat(path, G_DIR_SEPARATOR_S, "junk.db",
501
                                           NULL);
502
                else
503
                        file = g_strdup("junk.db");
504
                xfilter_debug_print("xfilter_bayes_db_init: opening db: %s\n", file);
505
                junk_kvs = xfilter_kvs_open(file);
506
                if (!junk_kvs) {
507
                        g_warning("Cannot open database: %s", file);
508
                        g_free(file);
509
                        return -1;
510
                }
511
                g_free(file);
512
        }
513
        if (!clean_kvs) {
514
                if (path)
515
                        file = g_strconcat(path, G_DIR_SEPARATOR_S, "clean.db",
516
                                           NULL);
517
                else
518
                        file = g_strdup("clean.db");
519
                xfilter_debug_print("xfilter_bayes_db_init: opening db: %s\n", file);
520
                clean_kvs = xfilter_kvs_open(file);
521
                if (!clean_kvs) {
522
                        g_warning("Cannot open database: %s", file);
523
                        xfilter_kvs_close(junk_kvs);
524
                        g_free(file);
525
                        return -1;
526
                }
527
                g_free(file);
528
        }
529
        if (!prob_kvs) {
530
                if (path)
531
                        file = g_strconcat(path, G_DIR_SEPARATOR_S, "prob.db",
532
                                           NULL);
533
                else
534
                        file = g_strdup("prob.db");
535
                xfilter_debug_print("xfilter_bayes_db_init: opening db: %s\n", file);
536
                prob_kvs = xfilter_kvs_open(file);
537
                if (!prob_kvs) {
538
                        g_warning("Cannot open database: %s", file);
539
                        xfilter_kvs_close(clean_kvs);
540
                        xfilter_kvs_close(junk_kvs);
541
                        g_free(file);
542
                        return -1;
543
                }
544
                g_free(file);
545
        }
546

    
547
        return 0;
548
}
549

    
550
int xfilter_bayes_db_done(void)
551
{
552
        int ret = 0;
553

    
554
        xfilter_debug_print("xfilter_bayes_db_init: close database\n");
555

    
556
        if (prob_kvs)
557
                ret |= xfilter_kvs_close(prob_kvs);
558
        if (clean_kvs)
559
                ret |= xfilter_kvs_close(clean_kvs);
560
        if (junk_kvs)
561
                ret |= xfilter_kvs_close(junk_kvs);
562

    
563
        return ret;
564
}