Statistics
| Branch: | Tag: | Revision:

root / src / sylfilter.c @ b60592cc

History | View | Annotate | Download (11.3 KB)

/* SylFilter - a message filter
 *
 * Copyright (C) 2011 Hiroyuki Yamamoto
 * Copyright (C) 2011 Sylpheed Development Team
 */
6

    
7
#include "config.h"
8

    
9
#include <stdio.h>
10
#include <string.h>
11
#include <locale.h>
12

    
13
#include "filter.h"
14
#include "filter-manager.h"
15
#include "filter-utils.h"
16
#include "filter-kvs.h"
17

    
18
#ifdef USE_QDBM
19
#  include "filter-kvs-qdbm.h"
20
#endif
21
#ifdef USE_SQLITE
22
#  include "filter-kvs-sqlite.h"
23
#endif
24
#ifdef USE_GDBM
25
#  include "filter-kvs-gdbm.h"
26
#endif
27

    
28
#include "textcontent-filter.h"
29
#include "blacklist-filter.h"
30
#include "whitelist-filter.h"
31
#include "wordsep-filter.h"
32
#include "bayes-filter.h"
33

    
34
/* Top-level operation selected from the command line. */
enum {
        MODE_TEST_JUNK   = 0,   /* classify messages (default, or -t) */
        MODE_LEARN       = 1,   /* learn / unlearn (-j, -c, -J, -C) */
        MODE_SHOW_STATUS = 2    /* -s */
};
39

    
40
/* Bit flags describing what a learn run should do; they may be OR-ed. */
enum {
        MODE_LEARN_NONE    = 0x0,
        MODE_LEARN_JUNK    = 0x1,       /* -j */
        MODE_LEARN_CLEAN   = 0x2,       /* -c */
        MODE_UNLEARN_JUNK  = 0x4,       /* -J */
        MODE_UNLEARN_CLEAN = 0x8        /* -C */
};
47

    
48
static int verbose = 0;
49

    
50
static int learn_filter(int mode, const char *file);
51
static int test_filter(int mode, const char *file);
52
static void print_message_data(XMessageData *msgdata);
53
static void usage(void);
54

    
55

    
56
int main(int argc, char *argv[])
57
{
58
        int retval = 2;
59
        int i;
60
        int mode = MODE_TEST_JUNK;
61
        int learn_mode = MODE_LEARN_NONE;
62
        int no_bias = 0;
63
        const char *method = NULL;
64
        const char *min_dev = NULL;
65
        const char *robs = NULL;
66
        const char *robx = NULL;
67
        int count = 0;
68
        const char *dbpath = NULL;
69
#ifdef USE_QDBM
70
        const char *engine = "qdbm";
71
#elif defined(USE_SQLITE)
72
        const char *engine = "sqlite";
73
#elif defined(USE_GDBM)
74
        const char *engine = "gdbm";
75
#endif
76

    
77
        setlocale(LC_ALL, "");
78

    
79
        for (i = 1; i < argc; i++) {
80
                if (!strcmp(argv[i], "-j")) {
81
                        mode = MODE_LEARN;
82
                        learn_mode |= MODE_LEARN_JUNK;
83
                } else if (!strcmp(argv[i], "-c")) {
84
                        mode = MODE_LEARN;
85
                        learn_mode |= MODE_LEARN_CLEAN;
86
                } if (!strcmp(argv[i], "-J")) {
87
                        mode = MODE_LEARN;
88
                        learn_mode |= MODE_UNLEARN_JUNK;
89
                } else if (!strcmp(argv[i], "-C")) {
90
                        mode = MODE_LEARN;
91
                        learn_mode |= MODE_UNLEARN_CLEAN;
92
                } else if (!strcmp(argv[i], "-t"))
93
                        mode = MODE_TEST_JUNK;
94
                else if (!strcmp(argv[i], "-s"))
95
                        mode = MODE_SHOW_STATUS;
96
                else if (!strncmp(argv[i], "-v", 2)) {
97
                        verbose = 1;
98
                        if (argv[i][2] == 'v') {
99
                                verbose++;
100
                                if (argv[i][3] == 'v')
101
                                        verbose++;
102
                        }
103
                } else if (!strcmp(argv[i], "-d"))
104
                        xfilter_set_debug_mode(1);
105
                else if (!strcmp(argv[i], "-E")) {
106
                        i++;
107
                        if (i >= argc) {
108
                                usage();
109
                                return 1;
110
                        }
111
                        engine = argv[i];
112
                } else if (!strcmp(argv[i], "-B")) {
113
                        no_bias = 1;
114
                } else if (!strcmp(argv[i], "-m")) {
115
                        i++;
116
                        if (i >= argc) {
117
                                usage();
118
                                return 1;
119
                        }
120
                        if (argv[i][0] == 'n')
121
                                method = "n";
122
                } else if (!strcmp(argv[i], "-p")) {
123
                        i++;
124
                        if (i >= argc) {
125
                                usage();
126
                                return 1;
127
                        }
128
                        dbpath = argv[i];
129
                } else if (argv[i][0] == '-' && argv[i][1] == '-') {
130
                        if (!strcmp(argv[i] + 2, "min-dev")) {
131
                                i++;
132
                                if (i >= argc) {
133
                                        usage();
134
                                        return 1;
135
                                }
136
                                min_dev = argv[i];
137
                        } else if (!strcmp(argv[i] + 2, "robs")) {
138
                                i++;
139
                                if (i >= argc) {
140
                                        usage();
141
                                        return 1;
142
                                }
143
                                robs = argv[i];
144
                        } else if (!strcmp(argv[i] + 2, "robx")) {
145
                                i++;
146
                                if (i >= argc) {
147
                                        usage();
148
                                        return 1;
149
                                }
150
                                robx = argv[i];
151
                        } else if (!strncmp(argv[i] + 2, "help", 4)) {
152
                                usage();
153
                                return 0;
154
                        }
155
                } else if (!strcmp(argv[i], "-h")) {
156
                        usage();
157
                        return 0;
158
                }
159
        }
160

    
161
        xfilter_init(XF_APP_MODE_STANDALONE);
162

    
163
#ifdef USE_QDBM
164
        if (!strcasecmp(engine, "qdbm"))
165
                xfilter_kvs_qdbm_set_engine();
166
#else
167
        if (0) {}
168
#endif
169
#ifdef USE_SQLITE
170
        else if (!strcasecmp(engine, "sqlite"))
171
                xfilter_kvs_sqlite_set_engine();
172
#else
173
        else if (0) {}
174
#endif
175
#ifdef USE_GDBM
176
        else if (!strcasecmp(engine, "gdbm"))
177
                xfilter_kvs_gdbm_set_engine();
178
#else
179
        else if (0) {}
180
#endif
181
        else {
182
                fprintf(stderr, "Engine '%s' not supported.\n", engine);
183
                xfilter_done();
184
                return 127;
185
        }
186

    
187
        if (verbose)
188
                printf("engine %s has been selected\n", engine);
189

    
190
        /* set global config values */
191
        if (no_bias)
192
                xfilter_set_conf_value("no-bias", "t");
193
        if (method)
194
                xfilter_set_conf_value("method", method);
195
        if (min_dev)
196
                xfilter_set_conf_value("min-dev", min_dev);
197
        if (robs)
198
                xfilter_set_conf_value("robs", robs);
199
        if (robx)
200
                xfilter_set_conf_value("robx", robx);
201

    
202
        if (xfilter_utils_set_base_dir(dbpath) < 0) {
203
                fprintf(stderr, "Could not create base directory.\n");
204
                xfilter_done();
205
                return 127;
206
        }
207

    
208
        dbpath = xfilter_utils_get_base_dir();
209
        if (xfilter_bayes_db_init(dbpath) < 0) {
210
                fprintf(stderr, "Database initialization error.\n");
211
                xfilter_done();
212
                return 127;
213
        }
214

    
215
#define ARGV_OPTION_WITH_PARAM(arg)                                \
216
        (!strcmp(arg, "-E") || !strcmp(arg, "-m") ||                \
217
         !strcmp(arg, "-p") ||                                        \
218
         !strcmp(arg, "--min-dev") || !strcmp(arg, "--robs") ||        \
219
         !strcmp(arg, "--robx"))
220

    
221
        if (mode == MODE_SHOW_STATUS) {
222
                retval = xfilter_bayes_db_show_contents(verbose);
223
        } else if (mode == MODE_LEARN) {
224
                for (i = 1; i < argc; i++) {
225
                        if (ARGV_OPTION_WITH_PARAM(argv[i])) {
226
                                i++;
227
                                if (i >= argc)
228
                                        break;
229
                        } else if (argv[i][0] != '-') {
230
                                retval = learn_filter(learn_mode, argv[i]);
231
                                if (retval != 0)
232
                                        break;
233
                                count++;
234
                        }
235
                }
236
        } else {
237
                for (i = 1; i < argc; i++) {
238
                        if (ARGV_OPTION_WITH_PARAM(argv[i])) {
239
                                i++;
240
                                if (i >= argc)
241
                                        break;
242
                        } else if (argv[i][0] != '-') {
243
                                retval = test_filter(mode, argv[i]);
244
                                if (retval == 127)
245
                                        break;
246
                                count++;
247
                        }
248
                }
249
        }
250

    
251
        xfilter_bayes_db_done();
252
        xfilter_done();
253

    
254
        if (mode != MODE_SHOW_STATUS && count == 0)
255
                fprintf(stderr, "No input file.\n");
256

    
257
        if (verbose)
258
                printf("return value: %d\n", retval);
259

    
260
        return retval;
261
}
262

    
263
static int learn_filter(int mode, const char *file)
264
{
265
        XFilterManager *mgr;
266
        XMessageData *msgdata;
267
        XMessageData *resdata;
268
        XFilterResult *res;
269
        XFilterStatus status;
270
        int retval = 0;
271

    
272
        if (verbose)
273
                printf("learning message file: %s\n", file);
274

    
275
        if ((mode & (MODE_LEARN_JUNK | MODE_LEARN_CLEAN | MODE_UNLEARN_JUNK | MODE_UNLEARN_CLEAN)) == 0) {
276
                fprintf(stderr, "no learn mode specified\n");
277
                return 2;
278
        }
279
        if ((mode & (MODE_LEARN_JUNK | MODE_UNLEARN_CLEAN)) != 0 &&
280
            (mode & (MODE_LEARN_CLEAN | MODE_UNLEARN_JUNK)) != 0) {
281
                fprintf(stderr, "-j/-C and -c/-J cannot be specified at the same time\n");
282
                return 2;
283
        }
284
        if ((mode & (MODE_LEARN_CLEAN | MODE_UNLEARN_JUNK)) != 0 &&
285
            (mode & (MODE_LEARN_JUNK | MODE_UNLEARN_CLEAN)) != 0) {
286
                fprintf(stderr, "-c/-J and -j/-C cannot be specified at the same time\n");
287
                return 2;
288
        }
289

    
290
        mgr = xfilter_manager_new();
291
        xfilter_manager_filter_add(mgr, xfilter_textcontent_new());
292
        xfilter_manager_filter_add(mgr, xfilter_wordsep_new());
293

    
294
        if (mode & MODE_LEARN_JUNK)
295
                xfilter_manager_filter_add(mgr, xfilter_bayes_learn_junk_new());
296
        if (mode & MODE_LEARN_CLEAN)
297
                xfilter_manager_filter_add(mgr, xfilter_bayes_learn_nojunk_new());
298
        if (mode & MODE_UNLEARN_JUNK)
299
                xfilter_manager_filter_add(mgr, xfilter_bayes_unlearn_junk_new());
300
        if (mode & MODE_UNLEARN_CLEAN)
301
                xfilter_manager_filter_add(mgr, xfilter_bayes_unlearn_nojunk_new());
302

    
303
        msgdata = xfilter_message_data_read_file(file, "message/rfc822");
304

    
305
        res = xfilter_manager_run(mgr, msgdata);
306
        if (verbose)
307
                xfilter_result_print(res);
308
        status = xfilter_result_get_status(res);
309
        if (status == XF_UNSUPPORTED_TYPE || status == XF_ERROR) {
310
                fprintf(stderr, "%s: Error on learning mail\n", file);
311
                retval = 127;
312
        }
313

    
314
        if (xfilter_get_debug_mode()) {
315
                resdata = xfilter_result_get_message_data(res);
316
                print_message_data(resdata);
317
        }
318

    
319
        xfilter_result_free(res);
320
        xfilter_message_data_free(msgdata);
321
        xfilter_manager_free(mgr);
322

    
323
        return retval;
324
}
325

    
326
static int test_filter(int mode, const char *file)
327
{
328
        XFilterManager *mgr;
329
        XMessageData *msgdata;
330
        XMessageData *resdata;
331
        XFilterResult *res;
332
        XFilterStatus status;
333
        int retval = 0;
334

    
335
        XFilterConstructorFunc ctors[] = {
336
                xfilter_textcontent_new,
337
                xfilter_blacklist_new,
338
                xfilter_whitelist_new,
339
                xfilter_wordsep_new,
340
                xfilter_bayes_new,
341
                NULL
342
        };
343

    
344
        if (verbose)
345
                printf("testing message file: %s\n", file);
346

    
347
        mgr = xfilter_manager_new();
348
        xfilter_manager_add_filters(mgr, ctors);
349

    
350
        msgdata = xfilter_message_data_read_file(file, "message/rfc822");
351

    
352
        res = xfilter_manager_run(mgr, msgdata);
353
        if (verbose)
354
                xfilter_result_print(res);
355
        status = xfilter_result_get_status(res);
356
        if (status == XF_JUNK) {
357
                printf("%s: This is a junk mail (prob: %f)\n", file, xfilter_result_get_probability(res));
358
                retval = 0;
359
        } else if (status == XF_UNCERTAIN) {
360
                printf("%s: This mail could not be classified (prob: %f)\n", file, xfilter_result_get_probability(res));
361
                retval = 2;
362
        } else if (status == XF_UNSUPPORTED_TYPE || status == XF_ERROR) {
363
                printf("%s: Error on testing mail\n", file);
364
                retval = 127;
365
        } else {
366
                printf("%s: This is a clean mail (prob: %f)\n", file, xfilter_result_get_probability(res));
367
                retval = 1;
368
        }
369

    
370
        if (xfilter_get_debug_mode()) {
371
                resdata = xfilter_result_get_message_data(res);
372
                print_message_data(resdata);
373
        }
374

    
375
        xfilter_result_free(res);
376
        xfilter_message_data_free(msgdata);
377

    
378
        xfilter_manager_free(mgr);
379

    
380
        return retval;
381
}
382

    
383
/*
 * Dump the from/to/cc/subject/received attributes of a message to stdout,
 * one "name: value" line per attribute that is present.  When the verbosity
 * level is above 2 the message content is printed as well.
 * Does nothing when msgdata is NULL.
 */
static void print_message_data(XMessageData *msgdata)
{
        const char *content;

        if (!msgdata)
                return;

        printf("\n");

        content = xfilter_message_data_get_attribute(msgdata, XM_FROM);
        if (content)
                printf("from: %s\n", content);
        content = xfilter_message_data_get_attribute(msgdata, XM_TO);
        if (content)
                printf("to: %s\n", content);
        content = xfilter_message_data_get_attribute(msgdata, XM_CC);
        if (content)
                printf("cc: %s\n", content);
        content = xfilter_message_data_get_attribute(msgdata, XM_SUBJECT);
        if (content)
                printf("subject: %s\n", content);
        content = xfilter_message_data_get_attribute(msgdata, XM_RECEIVED);
        if (content)
                printf("received: %s\n", content);

        if (verbose > 2) {
                content = xfilter_message_data_get_content(msgdata);
                /* passing NULL for %s is undefined behavior; guard it the
                 * same way as the attributes above */
                if (content)
                        printf("content: %s\n", content);
        }
}
413

    
414
static void usage(void)
415
{
416
        printf("SylFilter (tentative name) version " VERSION "\n");
417
        printf("\n");
418
        printf("Usage: sylfilter [options] message [message ...]\n");
419
        printf("\n");
420
        printf("Options:\n");
421
        printf("  -j  learn junk (spam) messages\n");
422
        printf("  -c  learn clean (non-spam) messages\n");
423
        printf("  -J  unlearn junk (spam) messages\n");
424
        printf("  -C  unlearn clean (non-spam) messages\n");
425
        printf("  -t  classify messages\n");
426
        printf("  -v  show verbose messages\n");
427
        printf("  -d  show debug messages\n");
428
        printf("  -m n|r\n");
429
        printf("      specify filtering method\n");
430
        printf("      n : Paul Graham (Naive Bayes) method\n");
431
        printf("      r : Gary Robinson (Robinson-Fisher) method (default)\n");
432
        printf("  --min-dev\n");
433
        printf("      ignore if score near (default: 0.1)\n");
434
        printf("  --robs\n");
435
        printf("      Robinson's s parameter (default: 1.0)\n");
436
        printf("  --robx\n");
437
        printf("      Robinson's x parameter (default: 0.5)\n");
438
        printf("  -B  do not bias probability for clean mail\n");
439
        printf("      (Paul/Naive method only, may increase false-positive)\n");
440
        printf("\n");
441
        printf("  -h, --help\n");
442
        printf("      print this help message\n");
443
        printf("\n");
444
        printf("  -E <engine_name>\n");
445
        printf("      specify key-value store engine (show below)\n");
446
        printf("  -p <path>\n");
447
        printf("      specify database directory\n");
448
        printf("\n");
449
        printf("Return values:\n");
450
        printf("  0   junk (spam)\n");
451
        printf("  1   clean (non-spam)\n");
452
        printf("  2   uncertain\n");
453
        printf("  127 other errors\n");
454
        printf("\n");
455
        printf("Database will be created at %s/*.db\n",
456
               xfilter_utils_get_default_base_dir());
457
        printf("\n");
458
        printf("Available key-value stores:\n");
459
#ifdef USE_QDBM
460
        printf("  QDBM\n");
461
#endif
462
#ifdef USE_SQLITE
463
        printf("  SQLite\n");
464
#endif
465
#ifdef USE_GDBM
466
        printf("  GDBM\n");
467
#endif
468
}