Statistics
| Branch: | Tag: | Revision:

root / lib / filters / bayes-filter.c @ 90da63dd

History | View | Annotate | Download (24.2 KB)

1
/* SylFilter - a message filter
2
 *
3
 * Copyright (C) 2011-2012 Hiroyuki Yamamoto
4
 * Copyright (C) 2011-2012 Sylpheed Development Team
5
 */
6

    
7
#include "config.h"
8

    
9
#include <glib.h>
10
#include <stdio.h>
11
#include <string.h>
12
#include <math.h>
13
#include <unistd.h>
14
#include <errno.h>
15

    
16
#include "filter.h"
17
#include "filter-kvs.h"
18
#include "filter-utils.h"
19
#include "bayes-filter.h"
20

    
21
#ifdef BUILTIN_LIBSYLPH
22
#  include "libsylph/utils.h"
23
#else
24
#  include <sylph/utils.h>
25
#endif
26

    
27
#define N_TOKENS 15
28
#undef USE_STATUS_KVS
29

    
30
static XFilterKVS *junk_kvs;
31
static XFilterKVS *clean_kvs;
32

    
33
#ifdef USE_STATUS_KVS
34
  static XFilterKVS *prob_kvs;
35
#else
36
  static XFilterBayesLearnStatus learn_status;
37
  static char *status_file;
38
  static char *status_file_tmp;
39
#endif
40

    
41
/* Test */
42

    
43
typedef struct _XFilterBayesProbData
44
{
45
        GArray *array;
46
        XFilterBayesLearnStatus status;
47
        double robs;
48
        double robx;
49
} XFilterBayesProbData;
50

    
51
/* One token of the message being scored. */
typedef struct _XFilterKeyCount
{
        const char *key;        /* token text; borrowed, not owned by this struct */
        int count;              /* occurrences of the token in the message */
        double prob;            /* junk probability computed for the token */
} XFilterKeyCount;
57

    
58
/* One database token together with its counts in both databases. */
typedef struct _XFilterKeyCount2
{
        const char *key;        /* token text */
        int n_junk;             /* count stored in the junk database */
        int n_clean;            /* count stored in the clean database */
} XFilterKeyCount2;
64

    
65
/*
 * Count the space-separated words of TEXT into TABLE.
 *
 * table:  maps word (owned string key) to its occurrence count stored as a
 *         pointer-packed int.
 * prefix: when non-NULL every word is recorded as "<prefix>*<word>".
 * text:   input string; NULL is accepted and ignored.
 *
 * Only the space character separates words; runs of spaces are collapsed.
 */
static void xfilter_bayes_content_word_freq(GHashTable *table, const char *prefix, const char *text)
{
        const char *cur;

        if (text == NULL)
                return;

        cur = text;
        for (;;) {
                const char *start;
                char *token;
                int seen;

                /* skip the separators in front of the next word */
                while (*cur == ' ')
                        cur++;
                if (*cur == '\0')
                        break;

                start = cur;
                while (*cur != ' ' && *cur != '\0')
                        cur++;

                token = g_strndup(start, cur - start);
                if (prefix != NULL) {
                        char *tagged = g_strconcat(prefix, "*", token, NULL);

                        g_free(token);
                        token = tagged;
                }

                /* a missing entry looks up as NULL, i.e. a count of zero */
                seen = GPOINTER_TO_INT(g_hash_table_lookup(table, token)) + 1;
                g_hash_table_insert(table, token, GINT_TO_POINTER(seen));
        }
}
93

    
94
static GHashTable *xfilter_bayes_word_freq(const XMessageData *data)
95
{
96
        GHashTable *table;
97
        const char *content;
98

    
99
        table = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
100

    
101
        content = xfilter_message_data_get_attribute(data, XM_FROM);
102
        xfilter_bayes_content_word_freq(table, "From", content);
103
        content = xfilter_message_data_get_attribute(data, XM_TO);
104
        xfilter_bayes_content_word_freq(table, "To", content);
105
        content = xfilter_message_data_get_attribute(data, XM_CC);
106
        xfilter_bayes_content_word_freq(table, "Cc", content);
107
        content = xfilter_message_data_get_attribute(data, XM_SUBJECT);
108
        xfilter_bayes_content_word_freq(table, "Subject", content);
109
        content = xfilter_message_data_get_attribute(data, XM_RECEIVED);
110
        xfilter_bayes_content_word_freq(table, "Received", content);
111

    
112
        content = xfilter_message_data_get_content(data);
113
        xfilter_bayes_content_word_freq(table, NULL, content);
114

    
115
        return table;
116
}
117

    
118
/*
 * Derive a more general ("degenerated") form of WORD, one step at a time:
 *   - "prefix*rest"  -> "rest"
 *   - "...!!..."     -> text up to and including the first '!'
 *   - "...!..."      -> text before the '!'
 *   - contains an ASCII upper-case letter -> lower-cased copy
 *
 * Returns a newly allocated string the caller must g_free(), or NULL when
 * WORD is NULL or none of the rules applies.
 */
static char *get_degenerated_word(const char *word)
{
        const char *mark;
        const char *c;

        if (word == NULL)
                return NULL;

        mark = strchr(word, '*');
        if (mark != NULL)
                return g_strdup(mark + 1);

        mark = strchr(word, '!');
        if (mark != NULL) {
                /* a doubled '!' keeps one of them, a single '!' is cut off */
                if (mark[1] == '!')
                        return g_strndup(word, mark + 1 - word);
                return g_strndup(word, mark - word);
        }

        c = word;
        while (*c != '\0') {
                if (g_ascii_isupper(*c))
                        return g_ascii_strdown(word, -1);
                c++;
        }

        return NULL;
}
142

    
143
/*
 * Junk probability of a single token for the naive method.
 *
 * key:             token to look up in the junk and clean databases.
 * status:          learning counters; the word totals (not the message
 *                  counts) are used to normalise the token counts.
 * do_degeneration: when the token is unknown, retry with its degenerated
 *                  form (see get_degenerated_word()).
 *
 * The clean count is doubled unless the "no-bias" conf value is set.
 * Returns the probability, clamped to a range that narrows for tokens seen
 * fewer than five times, or -1.0 when nothing was learned yet or the token
 * (and its degenerated forms) is unknown.
 */
static double xfilter_get_prob_naive(const char *key, XFilterBayesLearnStatus *status, gboolean do_degeneration)
{
        /* clamp limits for tokens seen 1, 2, 3 and 4 times */
        static const double small_upper[] = { 0.6, 0.7, 0.8, 0.9 };
        static const double small_lower[] = { 0.4, 0.3, 0.2, 0.1 };
        double cap = 0.999;
        double base = 0.001;
        double junk_rate, clean_rate, p;
        int junk_total, clean_total;
        int junk_hits, clean_hits, hits;
        int weight;

        junk_total = status->junk_words;
        if (junk_total < 1)
                return -1.0;
        clean_total = status->nojunk_words;
        if (clean_total < 1)
                return -1.0;

        weight = (xfilter_get_conf_value("no-bias") != NULL) ? 1 : 2;

        junk_hits = xfilter_kvs_fetch_int(junk_kvs, key);
        clean_hits = xfilter_kvs_fetch_int(clean_kvs, key) * weight;
        hits = junk_hits + clean_hits;

        if (hits == 0) {
                char *simpler;

                p = -1.0;
                if (!do_degeneration)
                        return p;
                simpler = get_degenerated_word(key);
                if (simpler == NULL)
                        return p;

                xfilter_debug_print("[degen] %s -> %s\n", key, simpler);
                p = xfilter_get_prob_naive(simpler, status, TRUE);
                g_free(simpler);
                return p;
        }

        if (hits >= 1 && hits <= 4) {
                cap = small_upper[hits - 1];
                base = small_lower[hits - 1];
        }

        junk_rate = (double)junk_hits / junk_total;
        clean_rate = (double)clean_hits / clean_total;
        p = junk_rate / (clean_rate + junk_rate);

        if (p < base) {
                /* only a token never seen in junk may reach the limit itself */
                if (junk_hits != 0)
                        p = base + 0.002;
                else if (clean_hits > 10)
                        p = base;
                else
                        p = base + 0.001;
        } else if (p > cap) {
                if (clean_hits != 0)
                        p = cap - 0.002;
                else if (junk_hits > 10)
                        p = cap;
                else
                        p = cap - 0.001;
        }

        xfilter_debug_print("%s: %4f (j: %d c: %d)\n", (gchar *)key, p, junk_hits, clean_hits);

        return p;
}
221

    
222
/*
 * GHFunc for the word-frequency table (naive method): compute the token's
 * probability and append it to the XFilterBayesProbData array passed as DATA.
 * Tokens without a usable probability are recorded as 0.4.
 */
static void naive_test_walk_func(gpointer key, gpointer val, gpointer data)
{
        XFilterBayesProbData *pd = data;
        XFilterKeyCount entry;

        entry.key = key;
        entry.count = GPOINTER_TO_INT(val);
        entry.prob = xfilter_get_prob_naive(entry.key, &pd->status, TRUE);
        if (entry.prob < 0)
                entry.prob = 0.4;

        g_array_append_val(pd->array, entry);
}
237

    
238
/*
 * GCompareFunc ordering XFilterKeyCount entries by how far their probability
 * is from 0.5, most extreme first.
 *
 * The distances are compared directly instead of returning a scaled double
 * difference truncated to int: truncation made entries whose distances differ
 * by less than 1e-4 compare equal, which is not a consistent ordering.
 */
static gint key_prob_compare_func(gconstpointer a, gconstpointer b)
{
        const XFilterKeyCount *kc1 = a;
        const XFilterKeyCount *kc2 = b;
        double da = ABS(0.5 - kc1->prob);
        double db = ABS(0.5 - kc2->prob);

        if (db > da)
                return 1;
        if (db < da)
                return -1;
        return 0;
}
248

    
249
/*
 * Combined junk probability of a message using the naive method.
 *
 * data:  message to score.
 * pdata: working state; pdata->status must be filled in by the caller.
 *        pdata->array is created and released here and is NULL on return.
 *
 * Every token gets a probability, the tokens are sorted by their distance
 * from 0.5, and the first N_TOKENS are combined as
 * prod(p) / (prod(p) + prod(1 - p)).  A message without tokens yields 0.5.
 */
static double xfilter_get_combined_prob_naive(const XMessageData *data, XFilterBayesProbData *pdata)
{
        GHashTable *table;
        double prod = 1.0, prod_rev = 1.0;
        double cmb_prob;
        guint i;

        xfilter_debug_print("\ncalculating probability for each tokens:\n");

        table = xfilter_bayes_word_freq(data);
        pdata->array = g_array_sized_new(FALSE, FALSE, sizeof(XFilterKeyCount), 128);

        xfilter_kvs_begin(junk_kvs);
        xfilter_kvs_begin(clean_kvs);
        g_hash_table_foreach(table, naive_test_walk_func, pdata);
        xfilter_kvs_end(junk_kvs);
        xfilter_kvs_end(clean_kvs);
        g_array_sort(pdata->array, key_prob_compare_func);

        xfilter_debug_print("\nmost interesting tokens:\n");
        for (i = 0; i < N_TOKENS && i < pdata->array->len; i++) {
                const XFilterKeyCount *kc = &g_array_index(pdata->array, XFilterKeyCount, i);

                prod *= kc->prob;
                prod_rev *= 1 - kc->prob;
                xfilter_debug_print("%s: %d %4f\n", kc->key, kc->count, kc->prob);
        }

        cmb_prob = prod / (prod + prod_rev);
        xfilter_debug_print("\ncombined probability (Paul/Naive): %4f\n", cmb_prob);

        /* the entries borrow their keys from the table, so free the array first */
        g_array_free(pdata->array, TRUE);
        pdata->array = NULL;
        g_hash_table_destroy(table);

        return cmb_prob;
}
284

    
285
/*
 * Junk probability f(w) of a single token for the Robinson-Fisher method.
 *
 * key:             token to look up in the junk and clean databases.
 * status:          learning counters; the word totals scale the clean count.
 * s, x:            strength and assumed probability for unknown tokens;
 *                  s must be >= 0.01 and x within [0.01, 0.99].
 * do_degeneration: when the token is unknown, retry with its degenerated
 *                  form (see get_degenerated_word()).
 *
 * Returns f(w) clamped to [0.000001, 0.999999], 0.5 for a token that stays
 * unknown, or -1.0 when nothing was learned yet or s/x are out of range.
 */
static double xfilter_get_prob_fisher(const char *key, XFilterBayesLearnStatus *status, double s, double x, gboolean do_degeneration)
{
        const double f_max = 0.999999;
        const double f_min = 0.000001;
        int junk_total, clean_total;
        int junk_hits, clean_hits;
        double ratio;
        double f;

        junk_total = status->junk_words;
        if (junk_total < 1)
                return -1.0;
        clean_total = status->nojunk_words;
        if (clean_total < 1)
                return -1.0;
        if (s < 0.01 || x < 0.01 || x > 0.99)
                return -1.0;

        junk_hits = xfilter_kvs_fetch_int(junk_kvs, key);
        clean_hits = xfilter_kvs_fetch_int(clean_kvs, key);

        if (junk_hits + clean_hits == 0) {
                char *simpler = do_degeneration ? get_degenerated_word(key) : NULL;

                f = 0.5;
                if (simpler != NULL) {
                        xfilter_debug_print("[degen] %s -> %s\n", key, simpler);
                        f = xfilter_get_prob_fisher(simpler, status, s, x, TRUE);
                        g_free(simpler);
                }
                return f;
        }

        /* weight clean hits as if both corpora held the same number of words */
        ratio = (double)junk_total / clean_total;
        f = (s * x + junk_hits) / (s + junk_hits + clean_hits * ratio);

        if (f < f_min)
                f = f_min;
        else if (f > f_max)
                f = f_max;

        xfilter_debug_print("%s: %4f (j: %d c: %d)\n", (gchar *)key, f, junk_hits, clean_hits);

        return f;
}
339

    
340
/*
 * GHFunc for the word-frequency table (Fisher method): compute the token's
 * f(w) with the robs/robx values stored in DATA and append the result to the
 * array there.  A negative (unavailable) probability is recorded as 0.5.
 */
static void fisher_test_walk_func(gpointer key, gpointer val, gpointer data)
{
        XFilterBayesProbData *pd = data;
        XFilterKeyCount entry;

        entry.key = key;
        entry.count = GPOINTER_TO_INT(val);
        entry.prob = xfilter_get_prob_fisher(entry.key, &pd->status, pd->robs, pd->robx, TRUE);
        if (entry.prob < 0)
                entry.prob = 0.5;

        g_array_append_val(pd->array, entry);
}
353

    
354
/* inverse chi-squared function */
355
static double chi2q(double x2, double v)
356
{
357
        double m;
358
        double sum;
359
        double term;
360
        int i;
361

    
362
        m = x2 / 2.0;
363
        sum = term = exp(0.0 - m);
364

    
365
        for (i = 1; i < (v / 2) - 1; i++) {
366
                term *= m / i;
367
                sum += term;
368
        }
369

    
370
        return sum < 1.0 ? sum : 1.0;
371
}
372

    
373
/*
 * Read a floating point conf value.
 *
 * Returns DEF when the option is unset, does not start with a number, or
 * parses to NaN; otherwise the parsed value limited to [MIN, MAX].
 */
static double xfilter_bayes_conf_double(const char *name, double def, double min, double max)
{
        const char *val = xfilter_get_conf_value(name);
        char *end;
        double d;

        if (!val)
                return def;

        d = strtod(val, &end);
        if (end == val || isnan(d))
                return def;
        if (d < min)
                return min;
        if (d > max)
                return max;
        return d;
}

/*
 * Combined junk probability of a message using the Robinson-Fisher method.
 *
 * data:  message to score.
 * pdata: working state; pdata->status must be filled in by the caller.
 *        robs/robx are set here, and pdata->array is created and released
 *        here (NULL on return).
 *
 * Conf values: "min-dev" (default 0.1, at most 0.499), "robs" (default 1.0,
 * limited to [0.01, 1.0]) and "robx" (default 0.5, limited to [0.01, 0.99]).
 * Only tokens whose probability is more than min-dev away from 0.5 take part.
 * The result is (1 + Q - P) / 2; with no participating token it is 0.5.
 */
static double xfilter_get_combined_prob_fisher(const XMessageData *data, XFilterBayesProbData *pdata)
{
        GHashTable *table;
        double sum = 0.0, sum_rev = 0.0;
        double min_dev, s, x;
        double P, Q;
        double cmb_prob;
        int N = 0;
        guint i;

        xfilter_debug_print("\ncalculating probability for each tokens:\n");

        /* min-dev has no lower limit: a negative value simply admits every token */
        min_dev = xfilter_bayes_conf_double("min-dev", 0.1, -HUGE_VAL, 0.499);
        s = xfilter_bayes_conf_double("robs", 1.0, 0.01, 1.0);
        x = xfilter_bayes_conf_double("robx", 0.5, 0.01, 0.99);

        table = xfilter_bayes_word_freq(data);
        pdata->array = g_array_sized_new(FALSE, FALSE, sizeof(XFilterKeyCount), 128);
        pdata->robs = s;
        pdata->robx = x;

        xfilter_kvs_begin(junk_kvs);
        xfilter_kvs_begin(clean_kvs);
        g_hash_table_foreach(table, fisher_test_walk_func, pdata);
        xfilter_kvs_end(junk_kvs);
        xfilter_kvs_end(clean_kvs);

        xfilter_debug_print("\ninteresting tokens:\n");
        for (i = 0; i < pdata->array->len; i++) {
                const XFilterKeyCount *kc = &g_array_index(pdata->array, XFilterKeyCount, i);

                if (ABS(kc->prob - 0.5) > min_dev) {
                        sum_rev += log(1 - kc->prob);
                        sum += log(kc->prob);
                        N++;
                        xfilter_debug_print("%s: %d %4f\n", kc->key, kc->count, kc->prob);
                }
        }

        /* each participating token contributes two degrees of freedom */
        P = chi2q(-2 * sum_rev, 2 * N);
        Q = chi2q(-2 * sum, 2 * N);
        cmb_prob = (1 + Q - P) / 2;
        xfilter_debug_print("\ncombined probability (Robinson-Fisher): %4f (min_dev: %f, s: %f, x: %f, N: %d, P = %f, Q = %f\n", cmb_prob, min_dev, s, x, N, P, Q);

        /* the entries borrow their keys from the table, so free the array first */
        g_array_free(pdata->array, TRUE);
        pdata->array = NULL;
        g_hash_table_destroy(table);

        return cmb_prob;
}
452

    
453
/*
 * Test-filter callback: classify a message with the Bayesian databases.
 *
 * filter: unused.
 * data:   message to classify; only "text/..." MIME types are handled.
 * result: receives the combined probability and the resulting status.
 *
 * The naive method is used when the "method" conf value starts with 'n',
 * the Robinson-Fisher method otherwise.  Until both a junk and a clean
 * message have been learned the probability is fixed at 0.5.
 *
 * Returns XF_JUNK above 0.90, XF_NOJUNK below 0.10, XF_UNCERTAIN in between,
 * XF_UNSUPPORTED_TYPE for non-text messages and XF_ERROR when RESULT is NULL
 * or a database is not open.
 */
static XFilterStatus xfilter_bayes_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const char *type;
        XFilterBayesProbData pdata;
        double cmb_prob;
        XFilterStatus status;
        const char *method;

        g_return_val_if_fail(result != NULL, XF_ERROR);

        type = xfilter_message_data_get_mime_type(data);
        /* MIME types are ASCII, so use the locale-independent comparison */
        if (!type || g_ascii_strncasecmp(type, "text/", 5) != 0) {
                xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
                return XF_UNSUPPORTED_TYPE;
        }

        if (!junk_kvs) {
                g_warning("Cannot open junk database");
                xfilter_result_set_status(result, XF_ERROR);
                return XF_ERROR;
        }
        /* both databases are read while scoring, so the clean one is required too */
        if (!clean_kvs) {
                g_warning("Cannot open clean database");
                xfilter_result_set_status(result, XF_ERROR);
                return XF_ERROR;
        }

        xfilter_debug_print("bayes-guessing message\n");

        method = xfilter_get_conf_value("method");

        xfilter_bayes_get_learn_status(&pdata.status);
        if (pdata.status.junk_learned_num < 1) {
                xfilter_debug_print("junk message not learned yet\n");
                cmb_prob = 0.5;
        } else if (pdata.status.nojunk_learned_num < 1) {
                xfilter_debug_print("clean message not learned yet\n");
                cmb_prob = 0.5;
        } else {
                if (method && method[0] == 'n')
                        cmb_prob = xfilter_get_combined_prob_naive(data, &pdata);
                else
                        cmb_prob = xfilter_get_combined_prob_fisher(data, &pdata);
        }

        xfilter_result_set_probability(result, cmb_prob);
        if (cmb_prob > 0.90)
                status = XF_JUNK;
        else if (cmb_prob < 0.10)
                status = XF_NOJUNK;
        else
                status = XF_UNCERTAIN;
        xfilter_result_set_status(result, status);

        return status;
}
504

    
505
XFilter *xfilter_bayes_new(void)
506
{
507
        XFilter *filter;
508

    
509
        filter = xfilter_new(XF_TEST, "bayes-test");
510
        xfilter_set_test_filter_func(X_TEST_FILTER(filter), xfilter_bayes_func);
511

    
512
        return filter;
513
}
514

    
515

    
516
/* Learning */
517

    
518
typedef struct _XFilterLearnWalkData
519
{
520
        XFilterKVS *kvs;
521
        int sum;
522
} XFilterLearnWalkData;
523

    
524
/*
 * GHFunc for the word-frequency table: add the token's count to the database
 * in the XFilterLearnWalkData passed as DATA and to its running sum.  The sum
 * is increased even when the database update reports an error.
 */
static void learn_walk_func(gpointer key, gpointer val, gpointer data)
{
        XFilterLearnWalkData *walk = data;
        const int occurrences = GPOINTER_TO_INT(val);

        if (xfilter_kvs_increment(walk->kvs, (gchar *)key, occurrences) < 0)
                g_warning("database update error");

        walk->sum += occurrences;
}
533

    
534
/*
 * GHFunc for the word-frequency table: subtract the token's count from the
 * database passed as DATA, warning when the update fails.
 */
static void unlearn_walk_func(gpointer key, gpointer val, gpointer data)
{
        XFilterKVS *db = data;
        const int occurrences = GPOINTER_TO_INT(val);

        if (xfilter_kvs_decrement(db, (gchar *)key, occurrences) < 0)
                g_warning("database update error");
}
542

    
543
/*
 * Update the learning counters after a (un)learn operation and persist them.
 *
 * is_junk:     TRUE to update the junk counters, FALSE for the clean ones.
 * is_register: TRUE when a message was learned, FALSE when it was unlearned.
 * sum_add:     when learning, the number of words to add to the word total;
 *              when unlearning, the new word total itself.
 *
 * Without USE_STATUS_KVS the counters are written to the temporary status
 * file, which is then renamed over the status file.  Every write step is
 * checked so that an incomplete temporary file is removed rather than
 * replacing the existing status file.
 *
 * Returns 0 on success, -1 when the status file could not be written.
 */
static int xfilter_update_status(gboolean is_junk, gboolean is_register, int sum_add)
{
#ifdef USE_STATUS_KVS
        xfilter_kvs_begin(prob_kvs);
        if (is_register) {
                if (is_junk) {
                        xfilter_kvs_increment(prob_kvs, "@junk_words_sum", sum_add);
                        xfilter_kvs_increment(prob_kvs, "@junk_learn_count", 1);
                } else {
                        xfilter_kvs_increment(prob_kvs, "@clean_words_sum", sum_add);
                        xfilter_kvs_increment(prob_kvs, "@clean_learn_count", 1);
                }
        } else {
                if (is_junk) {
                        xfilter_kvs_set_int(prob_kvs, "@junk_words_sum", sum_add);
                        xfilter_kvs_decrement(prob_kvs, "@junk_learn_count", 1);
                } else {
                        xfilter_kvs_set_int(prob_kvs, "@clean_words_sum", sum_add);
                        xfilter_kvs_decrement(prob_kvs, "@clean_learn_count", 1);
                }
        }
        xfilter_kvs_end(prob_kvs);

        return 0;
#else /* !USE_STATUS_KVS */
        FILE *status_fp;

        if (is_register) {
                if (is_junk) {
                        learn_status.junk_words += sum_add;
                        learn_status.junk_learned_num++;
                } else {
                        learn_status.nojunk_words += sum_add;
                        learn_status.nojunk_learned_num++;
                }
        } else {
                /* on unlearn the caller passes the recomputed total, not a delta */
                if (is_junk) {
                        learn_status.junk_words = sum_add;
                        if (learn_status.junk_learned_num > 0)
                                learn_status.junk_learned_num--;
                } else {
                        learn_status.nojunk_words = sum_add;
                        if (learn_status.nojunk_learned_num > 0)
                                learn_status.nojunk_learned_num--;
                }
        }

        xfilter_debug_print("xfilter_update_status: writing status to file\n");

        status_fp = g_fopen(status_file_tmp, "wb");
        if (!status_fp) {
                perror("fopen");
                return -1;
        }
        if (fprintf(status_fp,
                    "version=1\n"
                    "junk_words_sum=%d\n"
                    "junk_learn_count=%d\n"
                    "clean_words_sum=%d\n"
                    "clean_learn_count=%d\n",
                    learn_status.junk_words,
                    learn_status.junk_learned_num,
                    learn_status.nojunk_words,
                    learn_status.nojunk_learned_num) < 0) {
                perror("fprintf");
                fclose(status_fp);
                g_unlink(status_file_tmp);
                return -1;
        }

        if (fflush(status_fp) == EOF) {
                perror("fflush");
                fclose(status_fp);
                g_unlink(status_file_tmp);
                return -1;
        }
#if HAVE_FSYNC
        if (fsync(fileno(status_fp)) < 0) {
                perror("fsync");
        }
#elif defined(G_OS_WIN32)
        if (_commit(_fileno(status_fp)) < 0) {
                perror("_commit");
        }
#endif
        /* fclose() can still report a write error; do not publish the file then */
        if (fclose(status_fp) == EOF) {
                perror("fclose");
                g_unlink(status_file_tmp);
                return -1;
        }
        if (rename_force(status_file_tmp, status_file) < 0) {
                perror("rename");
                return -1;
        }

        xfilter_debug_print("xfilter_update_status: done\n");

        return 0;
#endif /* !USE_STATUS_KVS */
}
634

    
635
/*
 * Learn or unlearn one message.
 *
 * filter:      unused.
 * data:        message; only "text/..." MIME types are handled.
 * result:      receives the resulting status.
 * is_junk:     TRUE to work on the junk database, FALSE for the clean one.
 * is_register: TRUE to add the message's token counts, FALSE to remove them.
 *
 * After the token counts are updated the learning counters are refreshed:
 * with the number of words added when learning, or with the database's
 * recomputed count sum when unlearning.  A failure to store the counters is
 * reported as a warning; the token database has already been updated at
 * that point, so the status returned is unchanged.
 *
 * Returns XF_NONE on success, XF_UNSUPPORTED_TYPE for non-text messages and
 * XF_ERROR when RESULT is NULL or the database is not open.
 */
static XFilterStatus xfilter_bayes_learn(XFilter *filter, const XMessageData *data, XFilterResult *result, gboolean is_junk, gboolean is_register)
{
        const char *type;
        GHashTable *table;
        XFilterKVS *kvs;
        int sum_add;

        g_return_val_if_fail(result != NULL, XF_ERROR);

        type = xfilter_message_data_get_mime_type(data);
        /* MIME types are ASCII, so use the locale-independent comparison */
        if (!type || g_ascii_strncasecmp(type, "text/", 5) != 0) {
                xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
                return XF_UNSUPPORTED_TYPE;
        }

        kvs = is_junk ? junk_kvs : clean_kvs;
        if (!kvs) {
                g_warning("xfilter_bayes_learn: Cannot open database");
                xfilter_result_set_status(result, XF_ERROR);
                return XF_ERROR;
        }

        xfilter_debug_print("%slearning %s message\n", is_register ? "" : "un", is_junk ? "junk" : "clean");

        table = xfilter_bayes_word_freq(data);
        xfilter_kvs_begin(kvs);
        if (is_register) {
                XFilterLearnWalkData lwd = {kvs, 0};

                g_hash_table_foreach(table, learn_walk_func, &lwd);
                sum_add = lwd.sum;
        } else {
                g_hash_table_foreach(table, unlearn_walk_func, kvs);
                sum_add = xfilter_kvs_count_sum(kvs);
        }
        xfilter_kvs_end(kvs);
        g_hash_table_destroy(table);

        if (xfilter_update_status(is_junk, is_register, sum_add) < 0)
                g_warning("xfilter_bayes_learn: Cannot update learning status");

        xfilter_result_set_status(result, XF_NONE);

        return XF_NONE;
}
682

    
683
/* Content-filter callback: learn the message as junk. */
static XFilterStatus xfilter_bayes_learn_junk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const gboolean as_junk = TRUE;
        const gboolean do_register = TRUE;

        return xfilter_bayes_learn(filter, data, result, as_junk, do_register);
}
687

    
688
/* Content-filter callback: learn the message as clean. */
static XFilterStatus xfilter_bayes_learn_nojunk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const gboolean as_junk = FALSE;
        const gboolean do_register = TRUE;

        return xfilter_bayes_learn(filter, data, result, as_junk, do_register);
}
692

    
693
/* Content-filter callback: remove a message previously learned as junk. */
static XFilterStatus xfilter_bayes_unlearn_junk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const gboolean as_junk = TRUE;
        const gboolean do_register = FALSE;

        return xfilter_bayes_learn(filter, data, result, as_junk, do_register);
}
697

    
698
/* Content-filter callback: remove a message previously learned as clean. */
static XFilterStatus xfilter_bayes_unlearn_nojunk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
        const gboolean as_junk = FALSE;
        const gboolean do_register = FALSE;

        return xfilter_bayes_learn(filter, data, result, as_junk, do_register);
}
702

    
703
XFilter *xfilter_bayes_learn_junk_new(void)
704
{
705
        XFilter *filter;
706

    
707
        filter = xfilter_new(XF_CONTENT, "bayes-learn-junk");
708
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_learn_junk_func);
709

    
710
        return filter;
711
}
712

    
713
XFilter *xfilter_bayes_learn_nojunk_new(void)
714
{
715
        XFilter *filter;
716

    
717
        filter = xfilter_new(XF_CONTENT, "bayes-learn-clean");
718
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_learn_nojunk_func);
719

    
720
        return filter;
721
}
722

    
723
XFilter *xfilter_bayes_unlearn_junk_new(void)
724
{
725
        XFilter *filter;
726

    
727
        filter = xfilter_new(XF_CONTENT, "bayes-unlearn-junk");
728
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_unlearn_junk_func);
729

    
730
        return filter;
731
}
732

    
733
XFilter *xfilter_bayes_unlearn_nojunk_new(void)
734
{
735
        XFilter *filter;
736

    
737
        filter = xfilter_new(XF_CONTENT, "bayes-unlearn-clean");
738
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_unlearn_nojunk_func);
739

    
740
        return filter;
741
}
742

    
743

    
744
/*
 * Copy the current learning counters into STATUS.
 *
 * Returns 0 on success, -1 when STATUS is NULL.
 */
int xfilter_bayes_get_learn_status(XFilterBayesLearnStatus *status)
{
        g_return_val_if_fail(status != NULL, -1);

#ifndef USE_STATUS_KVS
        /* the counters live in memory; hand out a snapshot */
        *status = learn_status;
#else
        status->junk_words = xfilter_kvs_fetch_int(prob_kvs, "@junk_words_sum");
        status->nojunk_words = xfilter_kvs_fetch_int(prob_kvs, "@clean_words_sum");
        status->junk_learned_num = xfilter_kvs_fetch_int(prob_kvs, "@junk_learn_count");
        status->nojunk_learned_num = xfilter_kvs_fetch_int(prob_kvs, "@clean_learn_count");
#endif

        return 0;
}
759

    
760
/* Placeholder: performs no reset and always returns 0. */
int xfilter_bayes_reset_learn_count(void)
{
        return 0;
}
764

    
765
/* Placeholder: performs no reset and always returns 0. */
int xfilter_bayes_reset_all(void)
{
        return 0;
}
769

    
770
/*
 * KVS foreach callback: merge one database record into the table passed as
 * DATA, which maps token -> XFilterKeyCount2.
 *
 * Records whose value is not exactly one gint32 are skipped.  The count goes
 * to n_junk when KVS is the junk database, to n_clean otherwise.  New entries
 * are allocated here; the entry's key string doubles as the table key.
 *
 * Always returns 0.
 */
static int show_walk_func(XFilterKVS *kvs, const char *key, void *value, int size, void *data)
{
        GHashTable *table = (GHashTable *)data;
        XFilterKeyCount2 *kc;
        gint32 ival;

        if (size != (int)sizeof(ival))
                return 0;

        /* copy instead of casting: VALUE is an untyped buffer of unknown alignment */
        memcpy(&ival, value, sizeof(ival));

        kc = g_hash_table_lookup(table, key);
        if (!kc) {
                kc = g_new0(XFilterKeyCount2, 1);
                kc->key = g_strdup(key);
                g_hash_table_insert(table, (char *)kc->key, kc);
        }
        if (kvs == junk_kvs)
                kc->n_junk = ival;
        else
                kc->n_clean = ival;

        return 0;
}
793

    
794
/*
 * Comparison callback for a pointer array of XFilterKeyCount2: orders the
 * entries by their total count (junk + clean), largest first.  A and B point
 * at the array slots, i.e. at pointers to the entries.
 */
static gint key_count_compare_func(gconstpointer a, gconstpointer b)
{
        const XFilterKeyCount2 *const *lhs = a;
        const XFilterKeyCount2 *const *rhs = b;
        int lhs_total = (*lhs)->n_junk + (*lhs)->n_clean;
        int rhs_total = (*rhs)->n_junk + (*rhs)->n_clean;

        return rhs_total - lhs_total;
}
801

    
802
/* GHFunc: append the table value (an XFilterKeyCount2) to the GPtrArray in DATA. */
static void kc2_walk_func(gpointer key, gpointer val, gpointer data)
{
        g_ptr_array_add((GPtrArray *)data, val);
}
809

    
810
int xfilter_bayes_db_show_contents(int verbose)
811
{
812
        XFilterBayesLearnStatus status = {0};
813
        GPtrArray *array;
814
        GHashTable *table;
815

    
816
        if (!junk_kvs || !clean_kvs) {
817
                g_warning("Database not ready");
818
                return -1;
819
        }
820

    
821
        xfilter_bayes_get_learn_status(&status);
822

    
823
        if (verbose >= 3) {
824
                int i;
825

    
826
                table = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, g_free);
827
                xfilter_kvs_foreach(junk_kvs, show_walk_func, table);
828
                xfilter_kvs_foreach(clean_kvs, show_walk_func, table);
829
                array = g_ptr_array_sized_new(g_hash_table_size(table));
830
                g_hash_table_foreach(table, kc2_walk_func, array);
831
                g_ptr_array_sort(array, key_count_compare_func);
832

    
833
                printf("All tokens:\n");
834
                printf("%-40s  junk clean     n     f_w\n", "word");
835
                printf("----------------------------------------------------------------------------\n");
836
                for (i = 0; i < array->len; i++) {
837
                        double f_w;
838
                        XFilterKeyCount2 *kc;
839

    
840
                        kc = g_ptr_array_index(array, i);
841
                        f_w = xfilter_get_prob_fisher(kc->key, &status, 1.0, 0.5, FALSE);
842
                        printf("%-40s %5d %5d %5d     %4f\n", kc->key, kc->n_junk, kc->n_clean, kc->n_junk + kc->n_clean, f_w);
843
                }
844

    
845
                g_ptr_array_free(array, TRUE);
846
                g_hash_table_destroy(table);
847
        }
848

    
849
        printf("\nStatus:\n");
850
        printf("junk_words: %d\n", status.junk_words);
851
        printf("nojunk_words: %d\n", status.nojunk_words);
852
        printf("junk_learned_num: %d\n", status.junk_learned_num);
853
        printf("nojunk_learned_num: %d\n", status.nojunk_learned_num);
854

    
855
        return 0;
856
}
857

    
858
#ifndef USE_STATUS_KVS
859
int xfilter_read_status_file(FILE *fp)
860
{
861
        char buf[1024];
862
        int n;
863
        int version;
864

    
865
        while (fgets(buf, sizeof(buf), fp) != NULL) {
866
                if (sscanf(buf, "version=%d", &n) == 1)
867
                        version = n;
868
                else if (sscanf(buf, "junk_words_sum=%d", &n) == 1)
869
                        learn_status.junk_words = n;
870
                else if (sscanf(buf, "junk_learn_count=%d", &n) == 1)
871
                        learn_status.junk_learned_num = n;
872
                else if (sscanf(buf, "clean_words_sum=%d", &n) == 1)
873
                        learn_status.nojunk_words = n;
874
                else if (sscanf(buf, "clean_learn_count=%d", &n) == 1)
875
                        learn_status.nojunk_learned_num = n;
876
        }
877

    
878
        return 0;
879
}
880
#endif
881

    
882
int xfilter_bayes_db_init(const char *path)
883
{
884
        char *file;
885

    
886
        xfilter_debug_print("xfilter_bayes_db_init: init database\n");
887
        xfilter_debug_print("xfilter_bayes_db_init: path: %s\n",
888
                            path ? path : "(current dir)");
889

    
890
        if (path) {
891
                xfilter_debug_print("xfilter_bayes_db_init: making directory: %s\n", path);
892
                if (xfilter_utils_mkdir(path) < 0) {
893
                        g_warning("Making directory failed: %s", path);
894
                        return -1;
895
                }
896
        }
897

    
898
        if (!junk_kvs) {
899
                if (path)
900
                        file = g_strconcat(path, G_DIR_SEPARATOR_S, "junk.db",
901
                                           NULL);
902
                else
903
                        file = g_strdup("junk.db");
904
                xfilter_debug_print("xfilter_bayes_db_init: opening db: %s\n", file);
905
                junk_kvs = xfilter_kvs_open(file);
906
                if (!junk_kvs) {
907
                        g_warning("Cannot open database: %s", file);
908
                        g_free(file);
909
                        return -1;
910
                }
911
                g_free(file);
912
        }
913
        if (!clean_kvs) {
914
                if (path)
915
                        file = g_strconcat(path, G_DIR_SEPARATOR_S, "clean.db",
916
                                           NULL);
917
                else
918
                        file = g_strdup("clean.db");
919
                xfilter_debug_print("xfilter_bayes_db_init: opening db: %s\n", file);
920
                clean_kvs = xfilter_kvs_open(file);
921
                if (!clean_kvs) {
922
                        g_warning("Cannot open database: %s", file);
923
                        xfilter_kvs_close(junk_kvs);
924
                        junk_kvs = NULL;
925
                        g_free(file);
926
                        return -1;
927
                }
928
                g_free(file);
929
        }
930

    
931
#ifdef USE_STATUS_KVS
932
        if (!prob_kvs) {
933
                if (path)
934
                        file = g_strconcat(path, G_DIR_SEPARATOR_S, "prob.db",
935
                                           NULL);
936
                else
937
                        file = g_strdup("prob.db");
938
                xfilter_debug_print("xfilter_bayes_db_init: opening db: %s\n", file);
939
                prob_kvs = xfilter_kvs_open(file);
940
                if (!prob_kvs) {
941
                        g_warning("Cannot open database: %s", file);
942
                        xfilter_kvs_close(clean_kvs);
943
                        xfilter_kvs_close(junk_kvs);
944
                        clean_kvs = NULL;
945
                        junk_kvs = NULL;
946
                        g_free(file);
947
                        return -1;
948
                }
949
                g_free(file);
950
        }
951
#else /* !USE_STATUS_KVS */
952
        if (!status_file) {
953
                FILE *status_fp;
954

    
955
                if (path)
956
                        file = g_strconcat(path, G_DIR_SEPARATOR_S, "status.dat",
957
                                           NULL);
958
                else
959
                        file = g_strdup("status.dat");
960
                xfilter_debug_print("xfilter_bayes_db_init: opening data file: %s\n", file);
961
                status_fp = g_fopen(file, "rb");
962
                if (!status_fp) {
963
                        if (ENOENT == errno)
964
                                status_fp = g_fopen(file, "wb");
965

    
966
                        if (!status_fp) {
967
                                g_warning("Cannot open data file: %s", file);
968
                                xfilter_kvs_close(clean_kvs);
969
                                xfilter_kvs_close(junk_kvs);
970
                                clean_kvs = NULL;
971
                                junk_kvs = NULL;
972
                                g_free(file);
973
                                return -1;
974
                        }
975
                } else {
976
                        xfilter_read_status_file(status_fp);
977
                }
978

    
979
                fclose(status_fp);
980

    
981
                status_file = file;
982
                status_file_tmp = g_strconcat(file, ".tmp", NULL);
983
        }
984
#endif /* !USE_STATUS_KVS */
985

    
986
        return 0;
987
}
988

    
989
int xfilter_bayes_db_done(void)
990
{
991
        int ret = 0;
992

    
993
        xfilter_debug_print("xfilter_bayes_db_init: close database\n");
994

    
995
#ifdef USE_STATUS_KVS
996
        if (prob_kvs) {
997
                ret |= xfilter_kvs_close(prob_kvs);
998
                prob_kvs = NULL;
999
        }
1000
#else
1001
        if (status_file) {
1002
                g_free(status_file_tmp);
1003
                g_free(status_file);
1004
                status_file_tmp = NULL;
1005
                status_file = NULL;
1006
        }
1007
#endif
1008

    
1009
        if (clean_kvs) {
1010
                ret |= xfilter_kvs_close(clean_kvs);
1011
                clean_kvs = NULL;
1012
        }
1013
        if (junk_kvs) {
1014
                ret |= xfilter_kvs_close(junk_kvs);
1015
                junk_kvs = NULL;
1016
        }
1017

    
1018
        return ret;
1019
}