Statistics
| Branch: | Tag: | Revision:

root / lib / filters / bayes-filter.c @ a1de4e94

History | View | Annotate | Download (24.2 KB)

1
/* SylFilter - a message filter
2
 *
3
 * Copyright (C) 2011 Hiroyuki Yamamoto
4
 * Copyright (C) 2011 Sylpheed Development Team
5
 */
6

    
7
#include <glib.h>
8
#include <stdio.h>
9
#include <string.h>
10
#include <math.h>
11
#include <unistd.h>
12
#include <errno.h>
13

    
14
#include "filter.h"
15
#include "filter-kvs.h"
16
#include "filter-utils.h"
17
#include "bayes-filter.h"
18

    
19
#ifdef BUILTIN_LIBSYLPH
20
#  include "libsylph/utils.h"
21
#else
22
#  include <sylph/utils.h>
23
#endif
24

    
25
#define N_TOKENS 15
26
#undef USE_STATUS_KVS
27

    
28
static XFilterKVS *junk_kvs;
29
static XFilterKVS *clean_kvs;
30

    
31
#ifdef USE_STATUS_KVS
32
  static XFilterKVS *prob_kvs;
33
#else
34
  static XFilterBayesLearnStatus learn_status;
35
  static char *status_file;
36
  static char *status_file_tmp;
37
#endif
38

    
39
/* Test */
40

    
41
typedef struct _XFilterBayesProbData
42
{
43
        GArray *array;
44
        XFilterBayesLearnStatus status;
45
        double robs;
46
        double robx;
47
} XFilterBayesProbData;
48

    
49
/* One token of the message under test together with its estimated probability. */
typedef struct _XFilterKeyCount
{
	/* token text; borrowed from the word-frequency table, not owned here */
	const char *key;
	/* number of occurrences of the token in the message */
	int count;
	/* per-token probability computed by the naive or Fisher estimator */
	double prob;
} XFilterKeyCount;
55

    
56
/* One database token with the counts stored for it in both databases. */
typedef struct _XFilterKeyCount2
{
	/* token text */
	const char *key;
	/* count read from the junk database */
	int n_junk;
	/* count read from the clean database */
	int n_clean;
} XFilterKeyCount2;
62

    
63
/*
 * Count the space-separated words of TEXT into TABLE.
 *
 * table:  maps a word (string key) to its occurrence count, stored with
 *         GINT_TO_POINTER; every inserted key is a freshly allocated string.
 * prefix: when non-NULL, each key is stored as "<prefix>*<word>".
 * text:   input to split; NULL is accepted and ignored.
 *
 * Only the ASCII space character separates words.
 */
static void xfilter_bayes_content_word_freq(GHashTable *table, const char *prefix, const char *text)
{
	const char *cur;

	if (text == NULL)
		return;

	cur = text;
	for (;;) {
		const char *start;
		char *token;
		int seen;

		/* skip the separators in front of the next word */
		while (*cur == ' ')
			cur++;
		if (*cur == '\0')
			break;

		/* find the end of the word */
		start = cur;
		while (*cur != '\0' && *cur != ' ')
			cur++;

		token = g_strndup(start, cur - start);
		if (prefix != NULL) {
			char *bare = token;

			token = g_strconcat(prefix, "*", bare, NULL);
			g_free(bare);
		}

		/* a missing key looks up as NULL, i.e. a count of 0 */
		seen = GPOINTER_TO_INT(g_hash_table_lookup(table, token)) + 1;
		g_hash_table_insert(table, token, GINT_TO_POINTER(seen));
	}
}
91

    
92
static GHashTable *xfilter_bayes_word_freq(const XMessageData *data)
93
{
94
        GHashTable *table;
95
        const char *content;
96

    
97
        table = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
98

    
99
        content = xfilter_message_data_get_attribute(data, XM_FROM);
100
        xfilter_bayes_content_word_freq(table, "From", content);
101
        content = xfilter_message_data_get_attribute(data, XM_TO);
102
        xfilter_bayes_content_word_freq(table, "To", content);
103
        content = xfilter_message_data_get_attribute(data, XM_CC);
104
        xfilter_bayes_content_word_freq(table, "Cc", content);
105
        content = xfilter_message_data_get_attribute(data, XM_SUBJECT);
106
        xfilter_bayes_content_word_freq(table, "Subject", content);
107
        content = xfilter_message_data_get_attribute(data, XM_RECEIVED);
108
        xfilter_bayes_content_word_freq(table, "Received", content);
109

    
110
        content = xfilter_message_data_get_content(data);
111
        xfilter_bayes_content_word_freq(table, NULL, content);
112

    
113
        return table;
114
}
115

    
116
/*
 * Derive a simpler ("degenerated") form of WORD, used as a fallback key when
 * WORD itself is unknown to the databases.
 *
 * Rules, first match wins:
 *   - contains '*': the text after the first '*';
 *   - contains '!': the text before the first '!', keeping that '!' when it
 *     is immediately followed by another '!';
 *   - contains an ASCII upper-case letter: the ASCII-lower-cased word.
 *
 * Returns a newly allocated string to be released with g_free(), or NULL
 * when WORD is NULL or no rule applies.
 */
static char *get_degenerated_word(const char *word)
{
	const char *mark;
	const char *c;

	if (word == NULL)
		return NULL;

	mark = strchr(word, '*');
	if (mark != NULL)
		return g_strdup(mark + 1);

	mark = strchr(word, '!');
	if (mark != NULL) {
		size_t keep = mark - word;

		if (mark[1] == '!')
			keep++;
		return g_strndup(word, keep);
	}

	c = word;
	while (*c != '\0' && !g_ascii_isupper(*c))
		c++;
	if (*c != '\0')
		return g_ascii_strdown(word, -1);

	return NULL;
}
140

    
141
/*
 * Estimate the junk probability of one token for the naive method.
 *
 * key:             token to look up in the junk and clean databases.
 * status:          learning counters; junk_words / nojunk_words are used as
 *                  the normalisation totals.
 * do_degeneration: when TRUE and the token is unknown, retry with the form
 *                  returned by get_degenerated_word().
 *
 * The clean count is doubled unless the "no-bias" configuration value is
 * set. The result is kept inside a band that is narrower for tokens seen
 * fewer than five times.
 *
 * Returns the probability, or -1.0 when either total is below 1 or the
 * token (and its degenerated forms) is unknown.
 */
static double xfilter_get_prob_naive(const char *key, XFilterBayesLearnStatus *status, gboolean do_degeneration)
{
	/* clamp bands for tokens seen 1, 2, 3 or 4 times */
	static const double upper_small[] = { 0.6, 0.7, 0.8, 0.9 };
	static const double lower_small[] = { 0.4, 0.3, 0.2, 0.1 };
	double upper = 0.999;
	double lower = 0.001;
	double prob = -1.0;
	double junk_ratio;
	double clean_ratio;
	int junk_total;
	int clean_total;
	int bias;
	int n_junk;
	int n_clean;
	int seen;

	junk_total = status->junk_words;
	clean_total = status->nojunk_words;
	if (junk_total < 1 || clean_total < 1)
		return -1.0;

	bias = (xfilter_get_conf_value("no-bias") != NULL) ? 1 : 2;

	n_junk = xfilter_kvs_fetch_int(junk_kvs, key);
	n_clean = xfilter_kvs_fetch_int(clean_kvs, key) * bias;
	seen = n_junk + n_clean;

	if (seen == 0) {
		char *deg_key;

		if (!do_degeneration)
			return prob;

		deg_key = get_degenerated_word(key);
		if (deg_key == NULL)
			return prob;

		xfilter_debug_print("[degen] %s -> %s\n", key, deg_key);
		prob = xfilter_get_prob_naive(deg_key, status, TRUE);
		g_free(deg_key);
		return prob;
	}

	if (seen >= 1 && seen <= 4) {
		upper = upper_small[seen - 1];
		lower = lower_small[seen - 1];
	}

	junk_ratio = (double)n_junk / junk_total;
	clean_ratio = (double)n_clean / clean_total;
	prob = junk_ratio / (clean_ratio + junk_ratio);

	/* pull the value back inside the band, leaving a small margin that
	 * depends on how one-sided the evidence is */
	if (prob < lower) {
		if (n_junk != 0)
			prob = lower + 0.002;
		else if (n_clean > 10)
			prob = lower;
		else
			prob = lower + 0.001;
	} else if (prob > upper) {
		if (n_clean != 0)
			prob = upper - 0.002;
		else if (n_junk > 10)
			prob = upper;
		else
			prob = upper - 0.001;
	}

	xfilter_debug_print("%s: %4f (j: %d c: %d)\n", (gchar *)key, prob, n_junk, n_clean);

	return prob;
}
219

    
220
/*
 * g_hash_table_foreach() callback for the naive method.
 *
 * key:  token string; val: its occurrence count packed as a pointer;
 * data: the XFilterBayesProbData being filled.
 *
 * Appends one XFilterKeyCount to pdata->array. Tokens for which
 * xfilter_get_prob_naive() reports a negative value get 0.4 instead.
 */
static void naive_test_walk_func(gpointer key, gpointer val, gpointer data)
{
	XFilterBayesProbData *pdata = data;
	XFilterKeyCount entry;
	double p;

	p = xfilter_get_prob_naive(key, &pdata->status, TRUE);

	entry.key = key;
	entry.count = GPOINTER_TO_INT(val);
	entry.prob = (p < 0) ? 0.4 : p;

	g_array_append_val(pdata->array, entry);
}
235

    
236
/*
 * Sort comparator for XFilterKeyCount elements: the entry whose probability
 * lies farther from 0.5 sorts first.
 *
 * The distances are compared directly rather than by subtracting scaled
 * values and truncating to an integer: truncation makes distances closer
 * than 1e-4 compare as equal, which is not a consistent ordering for a sort
 * function (a == b and b == c while a < c).
 *
 * Returns a negative value when A sorts before B, a positive value when A
 * sorts after B, and 0 when both are equally far from 0.5.
 */
static gint key_prob_compare_func(gconstpointer a, gconstpointer b)
{
	const XFilterKeyCount *kc1 = a;
	const XFilterKeyCount *kc2 = b;
	double da, db;

	da = ABS(0.5 - kc1->prob);
	db = ABS(0.5 - kc2->prob);

	if (da > db)
		return -1;
	if (da < db)
		return 1;
	return 0;
}
246

    
247
/*
 * Combine the per-token probabilities of a message with the naive method.
 *
 * Every token of DATA is scored with xfilter_get_prob_naive(), the scores
 * are sorted with key_prob_compare_func(), and the first (at most) 15
 * entries are combined as prod(p) / (prod(p) + prod(1 - p)).
 *
 * pdata->status must be filled in by the caller; pdata->array is allocated
 * and released inside this function.
 *
 * NOTE(review): the file defines N_TOKENS as 15 but this limit is spelled
 * as a literal; presumably they are meant to be the same value - confirm.
 */
static double xfilter_get_combined_prob_naive(const XMessageData *data, XFilterBayesProbData *pdata)
{
	GHashTable *freq;
	double junk_product = 1.0;
	double clean_product = 1.0;
	double combined;
	unsigned int idx;

	xfilter_debug_print("\ncalculating probability for each tokens:\n");

	freq = xfilter_bayes_word_freq(data);
	pdata->array = g_array_sized_new(FALSE, FALSE, sizeof(XFilterKeyCount), 128);

	/* score every token while both databases are inside begin/end */
	xfilter_kvs_begin(junk_kvs);
	xfilter_kvs_begin(clean_kvs);
	g_hash_table_foreach(freq, naive_test_walk_func, pdata);
	xfilter_kvs_end(junk_kvs);
	xfilter_kvs_end(clean_kvs);

	g_array_sort(pdata->array, key_prob_compare_func);

	xfilter_debug_print("\nmost interesting tokens:\n");
	for (idx = 0; idx < 15 && idx < pdata->array->len; idx++) {
		const XFilterKeyCount *kc = &g_array_index(pdata->array, XFilterKeyCount, idx);

		junk_product *= kc->prob;
		clean_product *= 1 - kc->prob;
		xfilter_debug_print("%s: %d %4f\n", kc->key, kc->count, kc->prob);
	}

	combined = junk_product / (junk_product + clean_product);
	xfilter_debug_print("\ncombined probability (Paul/Naive): %4f\n", combined);

	g_array_free(pdata->array, TRUE);
	g_hash_table_destroy(freq);

	return combined;
}
282

    
283
/*
 * Estimate the junk probability f(w) of one token for the Fisher method.
 *
 * key:             token to look up in the junk and clean databases.
 * status:          learning counters; junk_words / nojunk_words are used to
 *                  scale the clean count.
 * s, x:            smoothing parameters of
 *                  f(w) = (s * x + n_junk) / (s + n_junk + n_clean * scale)
 *                  with scale = junk_words / nojunk_words.
 * do_degeneration: when TRUE and the token is unknown, retry with the form
 *                  returned by get_degenerated_word().
 *
 * Returns -1.0 when either total is below 1, s < 0.01, or x is outside
 * [0.01, 0.99]; 0.5 for a token that stays unknown; otherwise f(w) limited
 * to [0.000001, 0.999999].
 */
static double xfilter_get_prob_fisher(const char *key, XFilterBayesLearnStatus *status, double s, double x, gboolean do_degeneration)
{
	const double floor_prob = 0.000001;
	const double ceil_prob = 0.999999;
	int junk_total;
	int clean_total;
	int n_junk;
	int n_clean;
	double scale;
	double score;

	junk_total = status->junk_words;
	clean_total = status->nojunk_words;
	if (junk_total < 1 || clean_total < 1)
		return -1.0;
	if (s < 0.01 || x < 0.01 || x > 0.99)
		return -1.0;

	n_junk = xfilter_kvs_fetch_int(junk_kvs, key);
	n_clean = xfilter_kvs_fetch_int(clean_kvs, key);

	if (n_junk + n_clean == 0) {
		double fallback = 0.5;
		char *deg_key = NULL;

		if (do_degeneration)
			deg_key = get_degenerated_word(key);
		if (deg_key != NULL) {
			xfilter_debug_print("[degen] %s -> %s\n", key, deg_key);
			fallback = xfilter_get_prob_fisher(deg_key, status, s, x, TRUE);
			g_free(deg_key);
		}

		return fallback;
	}

	scale = (double)junk_total / clean_total;
	score = (s * x + n_junk) / (s + n_junk + n_clean * scale);

	if (score < floor_prob)
		score = floor_prob;
	else if (score > ceil_prob)
		score = ceil_prob;

	xfilter_debug_print("%s: %4f (j: %d c: %d)\n", (gchar *)key, score, n_junk, n_clean);

	return score;
}
337

    
338
/*
 * g_hash_table_foreach() callback for the Fisher method.
 *
 * key:  token string; val: its occurrence count packed as a pointer;
 * data: the XFilterBayesProbData being filled (its robs/robx fields supply
 *       the s and x parameters).
 *
 * Appends one XFilterKeyCount to pdata->array. Tokens for which
 * xfilter_get_prob_fisher() reports a negative value get 0.5 instead.
 */
static void fisher_test_walk_func(gpointer key, gpointer val, gpointer data)
{
	XFilterBayesProbData *pdata = data;
	XFilterKeyCount entry;
	double p;

	p = xfilter_get_prob_fisher(key, &pdata->status, pdata->robs, pdata->robx, TRUE);

	entry.key = key;
	entry.count = GPOINTER_TO_INT(val);
	entry.prob = (p < 0) ? 0.5 : p;

	g_array_append_val(pdata->array, entry);
}
351

    
352
/* inverse chi-squared function */
353
static double chi2q(double x2, double v)
354
{
355
        double m;
356
        double sum;
357
        double term;
358
        int i;
359

    
360
        m = x2 / 2.0;
361
        sum = term = exp(0.0 - m);
362

    
363
        for (i = 1; i < (v / 2) - 1; i++) {
364
                term *= m / i;
365
                sum += term;
366
        }
367

    
368
        return sum < 1.0 ? sum : 1.0;
369
}
370

    
371
/*
 * Combine the per-token probabilities of a message with the
 * Robinson-Fisher method.
 *
 * Configuration values read here:
 *   "min-dev": minimum |p - 0.5| a token needs to be counted
 *              (default 0.1, capped at 0.499);
 *   "robs":    s parameter (default 1.0, limited to [0.01, 1.0]);
 *   "robx":    x parameter (default 0.5, limited to [0.01, 0.99]).
 * A value that strtod() cannot parse falls back to the default.
 *
 * With N counted tokens, P = chi2q(-2 * sum(log(1 - p)), 2N) and
 * Q = chi2q(-2 * sum(log(p)), 2N); the result is (1 + Q - P) / 2.
 *
 * pdata->status must be filled in by the caller; pdata->array, robs and
 * robx are set here and the array is released before returning.
 */
static double xfilter_get_combined_prob_fisher(const XMessageData *data, XFilterBayesProbData *pdata)
{
	GHashTable *freq;
	const char *conf;
	char *end;
	double min_dev = 0.1;
	double s = 1.0;
	double x = 0.5;
	double log_sum = 0.0;
	double log_sum_rev = 0.0;
	double P, Q;
	double combined;
	int N = 0;
	unsigned int idx;

	xfilter_debug_print("\ncalculating probability for each tokens:\n");

	conf = xfilter_get_conf_value("min-dev");
	if (conf != NULL) {
		double parsed = strtod(conf, &end);

		if (end == conf)
			min_dev = 0.1;
		else
			min_dev = (parsed > 0.499) ? 0.499 : parsed;
	}

	conf = xfilter_get_conf_value("robs");
	if (conf != NULL) {
		double parsed = strtod(conf, &end);

		if (end == conf)
			s = 1.0;
		else
			s = (parsed < 0.01) ? 0.01 : ((parsed > 1.0) ? 1.0 : parsed);
	}

	conf = xfilter_get_conf_value("robx");
	if (conf != NULL) {
		double parsed = strtod(conf, &end);

		if (end == conf)
			x = 0.5;
		else
			x = (parsed < 0.01) ? 0.01 : ((parsed > 0.99) ? 0.99 : parsed);
	}

	freq = xfilter_bayes_word_freq(data);
	pdata->array = g_array_sized_new(FALSE, FALSE, sizeof(XFilterKeyCount), 128);
	pdata->robs = s;
	pdata->robx = x;

	/* score every token while both databases are inside begin/end */
	xfilter_kvs_begin(junk_kvs);
	xfilter_kvs_begin(clean_kvs);
	g_hash_table_foreach(freq, fisher_test_walk_func, pdata);
	xfilter_kvs_end(junk_kvs);
	xfilter_kvs_end(clean_kvs);

	xfilter_debug_print("\ninteresting tokens:\n");
	for (idx = 0; idx < pdata->array->len; idx++) {
		const XFilterKeyCount *kc = &g_array_index(pdata->array, XFilterKeyCount, idx);

		/* tokens too close to 0.5 carry no evidence and are skipped */
		if (!(ABS(kc->prob - 0.5) > min_dev))
			continue;

		log_sum_rev += log(1 - kc->prob);
		log_sum += log(kc->prob);
		N++;
		xfilter_debug_print("%s: %d %4f\n", kc->key, kc->count, kc->prob);
	}

	P = chi2q(-2 * log_sum_rev, 2 * N);
	Q = chi2q(-2 * log_sum, 2 * N);
	combined = (1 + Q - P) / 2;
	xfilter_debug_print("\ncombined probability (Robinson-Fisher): %4f (min_dev: %f, s: %f, x: %f, N: %d, P = %f, Q = %f\n", combined, min_dev, s, x, N, P, Q);

	g_array_free(pdata->array, TRUE);
	g_hash_table_destroy(freq);

	return combined;
}
450

    
451
/*
 * Test-filter callback: classify a message with the Bayesian databases.
 *
 * Only messages whose MIME type starts with "text/" (case-insensitive) are
 * handled; anything else yields XF_UNSUPPORTED_TYPE. XF_ERROR is returned
 * when RESULT is NULL or the junk database is not open.
 *
 * The combined probability is 0.5 until at least one junk and one clean
 * message have been learned. Otherwise the naive method is used when the
 * "method" configuration value starts with 'n', and the Fisher method in
 * every other case.
 *
 * The probability is stored in RESULT and mapped to XF_JUNK (> 0.90),
 * XF_NOJUNK (< 0.10) or XF_UNCERTAIN, which is also stored and returned.
 */
static XFilterStatus xfilter_bayes_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
	XFilterBayesProbData pdata;
	XFilterStatus verdict;
	const char *mime;
	const char *method;
	double cmb_prob;

	g_return_val_if_fail(result != NULL, XF_ERROR);

	mime = xfilter_message_data_get_mime_type(data);
	if (mime == NULL || g_strncasecmp(mime, "text/", 5) != 0) {
		xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
		return XF_UNSUPPORTED_TYPE;
	}

	if (junk_kvs == NULL) {
		g_warning("Cannot open junk database");
		xfilter_result_set_status(result, XF_ERROR);
		return XF_ERROR;
	}

	xfilter_debug_print("bayes-guessing message\n");

	method = xfilter_get_conf_value("method");

	xfilter_bayes_get_learn_status(&pdata.status);
	if (pdata.status.junk_learned_num < 1) {
		xfilter_debug_print("junk message not learned yet\n");
		cmb_prob = 0.5;
	} else if (pdata.status.nojunk_learned_num < 1) {
		xfilter_debug_print("clean message not learned yet\n");
		cmb_prob = 0.5;
	} else if (method != NULL && method[0] == 'n') {
		cmb_prob = xfilter_get_combined_prob_naive(data, &pdata);
	} else {
		cmb_prob = xfilter_get_combined_prob_fisher(data, &pdata);
	}

	xfilter_result_set_probability(result, cmb_prob);

	verdict = XF_UNCERTAIN;
	if (cmb_prob > 0.90)
		verdict = XF_JUNK;
	else if (cmb_prob < 0.10)
		verdict = XF_NOJUNK;
	xfilter_result_set_status(result, verdict);

	return verdict;
}
502

    
503
XFilter *xfilter_bayes_new(void)
504
{
505
        XFilter *filter;
506

    
507
        filter = xfilter_new(XF_TEST, "bayes-test");
508
        xfilter_set_test_filter_func(X_TEST_FILTER(filter), xfilter_bayes_func);
509

    
510
        return filter;
511
}
512

    
513

    
514
/* Learning */
515

    
516
typedef struct _XFilterLearnWalkData
517
{
518
        XFilterKVS *kvs;
519
        int sum;
520
} XFilterLearnWalkData;
521

    
522
/*
 * g_hash_table_foreach() callback used when learning a message.
 *
 * key:  token string; val: its occurrence count packed as a pointer;
 * data: XFilterLearnWalkData naming the target database.
 *
 * Adds the count to the token's entry in the database (warning on failure)
 * and, in either case, to the running total in the walk data.
 */
static void learn_walk_func(gpointer key, gpointer val, gpointer data)
{
	XFilterLearnWalkData *walk = data;
	const int occurrences = GPOINTER_TO_INT(val);

	if (xfilter_kvs_increment(walk->kvs, (gchar *)key, occurrences) < 0)
		g_warning("database update error");

	walk->sum += occurrences;
}
531

    
532
/*
 * g_hash_table_foreach() callback used when unlearning a message.
 *
 * key:  token string; val: its occurrence count packed as a pointer;
 * data: the XFilterKVS database to update.
 *
 * Subtracts the count from the token's entry, warning on failure.
 */
static void unlearn_walk_func(gpointer key, gpointer val, gpointer data)
{
	XFilterKVS *target = data;
	const int occurrences = GPOINTER_TO_INT(val);

	if (xfilter_kvs_decrement(target, (gchar *)key, occurrences) < 0)
		g_warning("database update error");
}
540

    
541
/*
 * Update the learning counters after a message was learned or unlearned.
 *
 * is_junk:     TRUE to update the junk counters, FALSE for the clean ones.
 * is_register: TRUE when learning (the word sum grows by SUM_ADD and the
 *              learn count by one); FALSE when unlearning (the word sum is
 *              replaced by SUM_ADD and the learn count drops by one).
 * sum_add:     word count to add, or the new word sum when unlearning.
 *
 * Without USE_STATUS_KVS the counters live in learn_status and are written
 * to status_file_tmp, which is then renamed over status_file. Every write
 * step is checked so that an incomplete temporary file is never renamed
 * over the existing status file; on failure the temporary file is removed.
 *
 * Returns 0 on success, -1 when the status file could not be written.
 */
static int xfilter_update_status(gboolean is_junk, gboolean is_register, int sum_add)
{
#ifdef USE_STATUS_KVS
	xfilter_kvs_begin(prob_kvs);
	if (is_register) {
		if (is_junk) {
			xfilter_kvs_increment(prob_kvs, "@junk_words_sum", sum_add);
			xfilter_kvs_increment(prob_kvs, "@junk_learn_count", 1);
		} else {
			xfilter_kvs_increment(prob_kvs, "@clean_words_sum", sum_add);
			xfilter_kvs_increment(prob_kvs, "@clean_learn_count", 1);
		}
	} else {
		if (is_junk) {
			xfilter_kvs_set_int(prob_kvs, "@junk_words_sum", sum_add);
			xfilter_kvs_decrement(prob_kvs, "@junk_learn_count", 1);
		} else {
			xfilter_kvs_set_int(prob_kvs, "@clean_words_sum", sum_add);
			xfilter_kvs_decrement(prob_kvs, "@clean_learn_count", 1);
		}
	}
	xfilter_kvs_end(prob_kvs);

	return 0;
#else /* !USE_STATUS_KVS */
	FILE *status_fp;

	if (is_register) {
		if (is_junk) {
			learn_status.junk_words += sum_add;
			learn_status.junk_learned_num++;
		} else {
			learn_status.nojunk_words += sum_add;
			learn_status.nojunk_learned_num++;
		}
	} else {
		/* the learn counts never go below zero */
		if (is_junk) {
			learn_status.junk_words = sum_add;
			if (learn_status.junk_learned_num > 0)
				learn_status.junk_learned_num--;
		} else {
			learn_status.nojunk_words = sum_add;
			if (learn_status.nojunk_learned_num > 0)
				learn_status.nojunk_learned_num--;
		}
	}

	xfilter_debug_print("xfilter_update_status: writing status to file\n");

	status_fp = g_fopen(status_file_tmp, "wb");
	if (!status_fp) {
		perror("fopen");
		return -1;
	}
	if (fprintf(status_fp,
		    "version=1\n"
		    "junk_words_sum=%d\n"
		    "junk_learn_count=%d\n"
		    "clean_words_sum=%d\n"
		    "clean_learn_count=%d\n",
		    learn_status.junk_words,
		    learn_status.junk_learned_num,
		    learn_status.nojunk_words,
		    learn_status.nojunk_learned_num) < 0) {
		perror("fprintf");
		fclose(status_fp);
		g_unlink(status_file_tmp);
		return -1;
	}

	if (fflush(status_fp) != 0) {
		perror("fflush");
		fclose(status_fp);
		g_unlink(status_file_tmp);
		return -1;
	}
	/* a failed sync is reported but does not abort the update */
#if HAVE_FSYNC
	if (fsync(fileno(status_fp)) < 0) {
		perror("fsync");
	}
#elif defined(G_OS_WIN32)
	if (_commit(_fileno(status_fp)) < 0) {
		perror("_commit");
	}
#endif
	if (fclose(status_fp) != 0) {
		perror("fclose");
		g_unlink(status_file_tmp);
		return -1;
	}
	if (rename_force(status_file_tmp, status_file) < 0) {
		perror("rename");
		g_unlink(status_file_tmp);
		return -1;
	}

	xfilter_debug_print("xfilter_update_status: done\n");

	return 0;
#endif /* !USE_STATUS_KVS */
}
632

    
633
/*
 * Learn or unlearn one message in the junk or clean database.
 *
 * is_junk:     TRUE selects the junk database, FALSE the clean one.
 * is_register: TRUE adds the message's token counts, FALSE subtracts them.
 *
 * Only messages whose MIME type starts with "text/" (case-insensitive) are
 * handled; anything else yields XF_UNSUPPORTED_TYPE. XF_ERROR is returned
 * when RESULT is NULL or the selected database is not open.
 *
 * After the database update the learning counters are refreshed through
 * xfilter_update_status(): when learning, with the number of words added;
 * when unlearning, with the database's current total count. A failure to
 * store the counters is reported with a warning; the token counts have
 * already been updated at that point, so the returned status stays XF_NONE.
 *
 * Returns XF_NONE (also stored in RESULT) once the database was updated.
 */
static XFilterStatus xfilter_bayes_learn(XFilter *filter, const XMessageData *data, XFilterResult *result, gboolean is_junk, gboolean is_register)
{
	const char *type;
	GHashTable *table;
	XFilterKVS *kvs;
	int sum_add;

	g_return_val_if_fail(result != NULL, XF_ERROR);

	type = xfilter_message_data_get_mime_type(data);
	if (!type || g_strncasecmp(type, "text/", 5) != 0) {
		xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
		return XF_UNSUPPORTED_TYPE;
	}

	if (is_junk)
		kvs = junk_kvs;
	else
		kvs = clean_kvs;
	if (!kvs) {
		g_warning("xfilter_bayes_learn: Cannot open database");
		xfilter_result_set_status(result, XF_ERROR);
		return XF_ERROR;
	}

	xfilter_debug_print("%slearning %s message\n", is_register ? "" : "un", is_junk ? "junk" : "clean");

	table = xfilter_bayes_word_freq(data);
	xfilter_kvs_begin(kvs);
	if (is_register) {
		XFilterLearnWalkData lwd = {kvs, 0};

		g_hash_table_foreach(table, learn_walk_func, &lwd);
		sum_add = lwd.sum;
	} else {
		g_hash_table_foreach(table, unlearn_walk_func, kvs);
		sum_add = xfilter_kvs_count_sum(kvs);
	}
	xfilter_kvs_end(kvs);
	g_hash_table_destroy(table);

	if (xfilter_update_status(is_junk, is_register, sum_add) < 0)
		g_warning("xfilter_bayes_learn: Cannot update learning status");

	xfilter_result_set_status(result, XF_NONE);

	return XF_NONE;
}
680

    
681
/* Content-filter callback: learn the message into the junk database. */
static XFilterStatus xfilter_bayes_learn_junk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
	const gboolean is_junk = TRUE;
	const gboolean is_register = TRUE;

	return xfilter_bayes_learn(filter, data, result, is_junk, is_register);
}
685

    
686
/* Content-filter callback: learn the message into the clean database. */
static XFilterStatus xfilter_bayes_learn_nojunk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
	const gboolean is_junk = FALSE;
	const gboolean is_register = TRUE;

	return xfilter_bayes_learn(filter, data, result, is_junk, is_register);
}
690

    
691
/* Content-filter callback: remove the message from the junk database. */
static XFilterStatus xfilter_bayes_unlearn_junk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
	const gboolean is_junk = TRUE;
	const gboolean is_register = FALSE;

	return xfilter_bayes_learn(filter, data, result, is_junk, is_register);
}
695

    
696
/* Content-filter callback: remove the message from the clean database. */
static XFilterStatus xfilter_bayes_unlearn_nojunk_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
	const gboolean is_junk = FALSE;
	const gboolean is_register = FALSE;

	return xfilter_bayes_learn(filter, data, result, is_junk, is_register);
}
700

    
701
XFilter *xfilter_bayes_learn_junk_new(void)
702
{
703
        XFilter *filter;
704

    
705
        filter = xfilter_new(XF_CONTENT, "bayes-learn-junk");
706
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_learn_junk_func);
707

    
708
        return filter;
709
}
710

    
711
XFilter *xfilter_bayes_learn_nojunk_new(void)
712
{
713
        XFilter *filter;
714

    
715
        filter = xfilter_new(XF_CONTENT, "bayes-learn-clean");
716
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_learn_nojunk_func);
717

    
718
        return filter;
719
}
720

    
721
XFilter *xfilter_bayes_unlearn_junk_new(void)
722
{
723
        XFilter *filter;
724

    
725
        filter = xfilter_new(XF_CONTENT, "bayes-unlearn-junk");
726
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_unlearn_junk_func);
727

    
728
        return filter;
729
}
730

    
731
XFilter *xfilter_bayes_unlearn_nojunk_new(void)
732
{
733
        XFilter *filter;
734

    
735
        filter = xfilter_new(XF_CONTENT, "bayes-unlearn-clean");
736
        xfilter_set_content_filter_func(X_CONTENT_FILTER(filter), xfilter_bayes_unlearn_nojunk_func);
737

    
738
        return filter;
739
}
740

    
741

    
742
/*
 * Copy the current learning counters into STATUS.
 *
 * With USE_STATUS_KVS the four counters are fetched from prob_kvs;
 * otherwise the in-memory learn_status is copied as a whole.
 *
 * Returns 0, or -1 (through g_return_val_if_fail) when STATUS is NULL.
 */
int xfilter_bayes_get_learn_status(XFilterBayesLearnStatus *status)
{
	g_return_val_if_fail(status != NULL, -1);

#ifdef USE_STATUS_KVS
	/* learn counts */
	status->junk_learned_num = xfilter_kvs_fetch_int(prob_kvs, "@junk_learn_count");
	status->nojunk_learned_num = xfilter_kvs_fetch_int(prob_kvs, "@clean_learn_count");
	/* word sums */
	status->junk_words = xfilter_kvs_fetch_int(prob_kvs, "@junk_words_sum");
	status->nojunk_words = xfilter_kvs_fetch_int(prob_kvs, "@clean_words_sum");
#else
	*status = learn_status;
#endif

	return 0;
}
757

    
758
/*
 * Placeholder: performs no work at present and always returns 0.
 */
int xfilter_bayes_reset_learn_count(void)
{
	const int result = 0;

	return result;
}
762

    
763
/*
 * Placeholder: performs no work at present and always returns 0.
 */
int xfilter_bayes_reset_all(void)
{
	const int result = 0;

	return result;
}
767

    
768
/*
 * xfilter_kvs_foreach() callback that merges one database entry into the
 * table of XFilterKeyCount2 records used for listing.
 *
 * kvs:   database being walked; entries of junk_kvs fill n_junk, entries
 *        of any other database fill n_clean.
 * key:   token string.
 * value: stored value; only values of exactly sizeof(gint32) bytes are
 *        used, everything else is skipped.
 * data:  GHashTable mapping the token to its XFilterKeyCount2; a record is
 *        created (with its own copy of the key) on first sight.
 *
 * The stored integer is copied out with memcpy() rather than read through
 * a cast pointer, because nothing here guarantees that VALUE is suitably
 * aligned for a gint32.
 *
 * Always returns 0.
 */
static int show_walk_func(XFilterKVS *kvs, const char *key, void *value, int size, void *data)
{
	GHashTable *table = (GHashTable *)data;
	XFilterKeyCount2 *kc;
	gint32 ival;

	if (size != (int)sizeof(ival))
		return 0;

	memcpy(&ival, value, sizeof(ival));

	kc = g_hash_table_lookup(table, key);
	if (!kc) {
		kc = g_new0(XFilterKeyCount2, 1);
		kc->key = g_strdup(key);
		g_hash_table_insert(table, (char *)kc->key, kc);
	}
	if (kvs == junk_kvs)
		kc->n_junk = ival;
	else
		kc->n_clean = ival;

	return 0;
}
791

    
792
/*
 * Sort comparator for a pointer array of XFilterKeyCount2 records: the
 * record with the larger n_junk + n_clean total sorts first.
 *
 * A and B point at array slots, i.e. at pointers to the records.
 */
static gint key_count_compare_func(gconstpointer a, gconstpointer b)
{
	const XFilterKeyCount2 *first = *(XFilterKeyCount2 **)a;
	const XFilterKeyCount2 *second = *(XFilterKeyCount2 **)b;
	int first_total = first->n_junk + first->n_clean;
	int second_total = second->n_junk + second->n_clean;

	return second_total - first_total;
}
799

    
800
/*
 * g_hash_table_foreach() callback: append the table value (an
 * XFilterKeyCount2 record) to the GPtrArray passed as DATA. KEY is unused.
 */
static void kc2_walk_func(gpointer key, gpointer val, gpointer data)
{
	GPtrArray *records = data;

	g_ptr_array_add(records, val);
}
807

    
808
int xfilter_bayes_db_show_contents(int verbose)
809
{
810
        XFilterBayesLearnStatus status = {0};
811
        GPtrArray *array;
812
        GHashTable *table;
813

    
814
        if (!junk_kvs || !clean_kvs) {
815
                g_warning("Database not ready");
816
                return -1;
817
        }
818

    
819
        xfilter_bayes_get_learn_status(&status);
820

    
821
        if (verbose >= 3) {
822
                int i;
823

    
824
                table = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, g_free);
825
                xfilter_kvs_foreach(junk_kvs, show_walk_func, table);
826
                xfilter_kvs_foreach(clean_kvs, show_walk_func, table);
827
                array = g_ptr_array_sized_new(g_hash_table_size(table));
828
                g_hash_table_foreach(table, kc2_walk_func, array);
829
                g_ptr_array_sort(array, key_count_compare_func);
830

    
831
                printf("All tokens:\n");
832
                printf("%-40s  junk clean     n     f_w\n", "word");
833
                printf("----------------------------------------------------------------------------\n");
834
                for (i = 0; i < array->len; i++) {
835
                        double f_w;
836
                        XFilterKeyCount2 *kc;
837

    
838
                        kc = g_ptr_array_index(array, i);
839
                        f_w = xfilter_get_prob_fisher(kc->key, &status, 1.0, 0.5, FALSE);
840
                        printf("%-40s %5d %5d %5d     %4f\n", kc->key, kc->n_junk, kc->n_clean, kc->n_junk + kc->n_clean, f_w);
841
                }
842

    
843
                g_ptr_array_free(array, TRUE);
844
                g_hash_table_destroy(table);
845
        }
846

    
847
        printf("\nStatus:\n");
848
        printf("junk_words: %d\n", status.junk_words);
849
        printf("nojunk_words: %d\n", status.nojunk_words);
850
        printf("junk_learned_num: %d\n", status.junk_learned_num);
851
        printf("nojunk_learned_num: %d\n", status.nojunk_learned_num);
852

    
853
        return 0;
854
}
855

    
856
#ifndef USE_STATUS_KVS
857
int xfilter_read_status_file(FILE *fp)
858
{
859
        char buf[1024];
860
        int n;
861
        int version;
862

    
863
        while (fgets(buf, sizeof(buf), fp) != NULL) {
864
                if (sscanf(buf, "version=%d", &n) == 1)
865
                        version = n;
866
                else if (sscanf(buf, "junk_words_sum=%d", &n) == 1)
867
                        learn_status.junk_words = n;
868
                else if (sscanf(buf, "junk_learn_count=%d", &n) == 1)
869
                        learn_status.junk_learned_num = n;
870
                else if (sscanf(buf, "clean_words_sum=%d", &n) == 1)
871
                        learn_status.nojunk_words = n;
872
                else if (sscanf(buf, "clean_learn_count=%d", &n) == 1)
873
                        learn_status.nojunk_learned_num = n;
874
        }
875

    
876
        return 0;
877
}
878
#endif
879

    
880
int xfilter_bayes_db_init(const char *path)
881
{
882
        char *file;
883

    
884
        xfilter_debug_print("xfilter_bayes_db_init: init database\n");
885
        xfilter_debug_print("xfilter_bayes_db_init: path: %s\n",
886
                            path ? path : "(current dir)");
887

    
888
        if (path) {
889
                xfilter_debug_print("xfilter_bayes_db_init: making directory: %s\n", path);
890
                if (xfilter_utils_mkdir(path) < 0) {
891
                        g_warning("Making directory failed: %s", path);
892
                        return -1;
893
                }
894
        }
895

    
896
        if (!junk_kvs) {
897
                if (path)
898
                        file = g_strconcat(path, G_DIR_SEPARATOR_S, "junk.db",
899
                                           NULL);
900
                else
901
                        file = g_strdup("junk.db");
902
                xfilter_debug_print("xfilter_bayes_db_init: opening db: %s\n", file);
903
                junk_kvs = xfilter_kvs_open(file);
904
                if (!junk_kvs) {
905
                        g_warning("Cannot open database: %s", file);
906
                        g_free(file);
907
                        return -1;
908
                }
909
                g_free(file);
910
        }
911
        if (!clean_kvs) {
912
                if (path)
913
                        file = g_strconcat(path, G_DIR_SEPARATOR_S, "clean.db",
914
                                           NULL);
915
                else
916
                        file = g_strdup("clean.db");
917
                xfilter_debug_print("xfilter_bayes_db_init: opening db: %s\n", file);
918
                clean_kvs = xfilter_kvs_open(file);
919
                if (!clean_kvs) {
920
                        g_warning("Cannot open database: %s", file);
921
                        xfilter_kvs_close(junk_kvs);
922
                        junk_kvs = NULL;
923
                        g_free(file);
924
                        return -1;
925
                }
926
                g_free(file);
927
        }
928

    
929
#ifdef USE_STATUS_KVS
930
        if (!prob_kvs) {
931
                if (path)
932
                        file = g_strconcat(path, G_DIR_SEPARATOR_S, "prob.db",
933
                                           NULL);
934
                else
935
                        file = g_strdup("prob.db");
936
                xfilter_debug_print("xfilter_bayes_db_init: opening db: %s\n", file);
937
                prob_kvs = xfilter_kvs_open(file);
938
                if (!prob_kvs) {
939
                        g_warning("Cannot open database: %s", file);
940
                        xfilter_kvs_close(clean_kvs);
941
                        xfilter_kvs_close(junk_kvs);
942
                        clean_kvs = NULL;
943
                        junk_kvs = NULL;
944
                        g_free(file);
945
                        return -1;
946
                }
947
                g_free(file);
948
        }
949
#else /* !USE_STATUS_KVS */
950
        if (!status_file) {
951
                FILE *status_fp;
952

    
953
                if (path)
954
                        file = g_strconcat(path, G_DIR_SEPARATOR_S, "status.dat",
955
                                           NULL);
956
                else
957
                        file = g_strdup("status.dat");
958
                xfilter_debug_print("xfilter_bayes_db_init: opening data file: %s\n", file);
959
                status_fp = g_fopen(file, "rb");
960
                if (!status_fp) {
961
                        if (ENOENT == errno)
962
                                status_fp = g_fopen(file, "wb");
963

    
964
                        if (!status_fp) {
965
                                g_warning("Cannot open data file: %s", file);
966
                                xfilter_kvs_close(clean_kvs);
967
                                xfilter_kvs_close(junk_kvs);
968
                                clean_kvs = NULL;
969
                                junk_kvs = NULL;
970
                                g_free(file);
971
                                return -1;
972
                        }
973
                } else {
974
                        xfilter_read_status_file(status_fp);
975
                }
976

    
977
                fclose(status_fp);
978

    
979
                status_file = file;
980
                status_file_tmp = g_strconcat(file, ".tmp", NULL);
981
        }
982
#endif /* !USE_STATUS_KVS */
983

    
984
        return 0;
985
}
986

    
987
int xfilter_bayes_db_done(void)
988
{
989
        int ret = 0;
990

    
991
        xfilter_debug_print("xfilter_bayes_db_init: close database\n");
992

    
993
#ifdef USE_STATUS_KVS
994
        if (prob_kvs) {
995
                ret |= xfilter_kvs_close(prob_kvs);
996
                prob_kvs = NULL;
997
        }
998
#else
999
        if (status_file) {
1000
                g_free(status_file_tmp);
1001
                g_free(status_file);
1002
                status_file_tmp = NULL;
1003
                status_file = NULL;
1004
        }
1005
#endif
1006

    
1007
        if (clean_kvs) {
1008
                ret |= xfilter_kvs_close(clean_kvs);
1009
                clean_kvs = NULL;
1010
        }
1011
        if (junk_kvs) {
1012
                ret |= xfilter_kvs_close(junk_kvs);
1013
                junk_kvs = NULL;
1014
        }
1015

    
1016
        return ret;
1017
}