Statistics
| Branch: | Tag: | Revision:

root / src / sylfilter.c @ 6899d5dd

History | View | Annotate | Download (11.5 KB)

/* SylFilter - a message filter
 *
 * Copyright (C) 2011 Hiroyuki Yamamoto
 * Copyright (C) 2011 Sylpheed Development Team
 */
6

    
7
#include "config.h"
8

    
9
#include <stdio.h>
10
#include <string.h>
11
#include <locale.h>
12

    
13
#include "filter.h"
14
#include "filter-manager.h"
15
#include "filter-utils.h"
16
#include "filter-kvs.h"
17

    
18
#ifdef USE_QDBM
19
#  include "filter-kvs-qdbm.h"
20
#endif
21
#ifdef USE_SQLITE
22
#  include "filter-kvs-sqlite.h"
23
#endif
24
#ifdef USE_GDBM
25
#  include "filter-kvs-gdbm.h"
26
#endif
27

    
28
#include "textcontent-filter.h"
29
#include "blacklist-filter.h"
30
#include "whitelist-filter.h"
31
#include "wordsep-filter.h"
32
#include "bayes-filter.h"
33

    
34
/* Top-level operation selected on the command line (see main()). */
enum {
        MODE_TEST_JUNK   = 0,   /* -t (default): classify the given messages */
        MODE_LEARN       = 1,   /* -j / -c / -J / -C: update the database */
        MODE_SHOW_STATUS = 2    /* -s: show the database contents */
};
39

    
40
/*
 * Learning sub-modes.  These are bit flags: main() ORs one flag in for each
 * of -j, -c, -J and -C, and learn_filter() tests them with a bitwise AND.
 */
enum {
        MODE_LEARN_NONE    = 0,
        MODE_LEARN_JUNK    = 1 << 0,    /* -j */
        MODE_LEARN_CLEAN   = 1 << 1,    /* -c */
        MODE_UNLEARN_JUNK  = 1 << 2,    /* -J */
        MODE_UNLEARN_CLEAN = 1 << 3     /* -C */
};
47

    
48
/* Verbosity level: 0 by default; main() sets it to 1, 2 or 3 for
 * -v, -vv and -vvv.  Read by the helpers below to gate extra output. */
static int verbose = 0;
49

    
50
static int learn_filter(int mode, const char *file);
51
static int test_filter(int mode, const char *file);
52
static void print_message_data(XMessageData *msgdata);
53
static void version(void);
54
static void usage(void);
55

    
56

    
57
int main(int argc, char *argv[])
58
{
59
        int retval = 2;
60
        int i;
61
        int mode = MODE_TEST_JUNK;
62
        int learn_mode = MODE_LEARN_NONE;
63
        int no_bias = 0;
64
        const char *method = NULL;
65
        const char *min_dev = NULL;
66
        const char *robs = NULL;
67
        const char *robx = NULL;
68
        int count = 0;
69
        const char *dbpath = NULL;
70
#ifdef USE_QDBM
71
        const char *engine = "qdbm";
72
#elif defined(USE_SQLITE)
73
        const char *engine = "sqlite";
74
#elif defined(USE_GDBM)
75
        const char *engine = "gdbm";
76
#endif
77

    
78
        setlocale(LC_ALL, "");
79

    
80
        for (i = 1; i < argc; i++) {
81
                if (!strcmp(argv[i], "-j")) {
82
                        mode = MODE_LEARN;
83
                        learn_mode |= MODE_LEARN_JUNK;
84
                } else if (!strcmp(argv[i], "-c")) {
85
                        mode = MODE_LEARN;
86
                        learn_mode |= MODE_LEARN_CLEAN;
87
                } if (!strcmp(argv[i], "-J")) {
88
                        mode = MODE_LEARN;
89
                        learn_mode |= MODE_UNLEARN_JUNK;
90
                } else if (!strcmp(argv[i], "-C")) {
91
                        mode = MODE_LEARN;
92
                        learn_mode |= MODE_UNLEARN_CLEAN;
93
                } else if (!strcmp(argv[i], "-t"))
94
                        mode = MODE_TEST_JUNK;
95
                else if (!strcmp(argv[i], "-s"))
96
                        mode = MODE_SHOW_STATUS;
97
                else if (!strncmp(argv[i], "-v", 2)) {
98
                        verbose = 1;
99
                        if (argv[i][2] == 'v') {
100
                                verbose++;
101
                                if (argv[i][3] == 'v')
102
                                        verbose++;
103
                        }
104
                } else if (!strcmp(argv[i], "-d"))
105
                        xfilter_set_debug_mode(1);
106
                else if (!strcmp(argv[i], "-E")) {
107
                        i++;
108
                        if (i >= argc) {
109
                                usage();
110
                                return 1;
111
                        }
112
                        engine = argv[i];
113
                } else if (!strcmp(argv[i], "-B")) {
114
                        no_bias = 1;
115
                } else if (!strcmp(argv[i], "-m")) {
116
                        i++;
117
                        if (i >= argc) {
118
                                usage();
119
                                return 1;
120
                        }
121
                        if (argv[i][0] == 'n')
122
                                method = "n";
123
                } else if (!strcmp(argv[i], "-p")) {
124
                        i++;
125
                        if (i >= argc) {
126
                                usage();
127
                                return 1;
128
                        }
129
                        dbpath = argv[i];
130
                } else if (argv[i][0] == '-' && argv[i][1] == '-') {
131
                        if (!strcmp(argv[i] + 2, "min-dev")) {
132
                                i++;
133
                                if (i >= argc) {
134
                                        usage();
135
                                        return 1;
136
                                }
137
                                min_dev = argv[i];
138
                        } else if (!strcmp(argv[i] + 2, "robs")) {
139
                                i++;
140
                                if (i >= argc) {
141
                                        usage();
142
                                        return 1;
143
                                }
144
                                robs = argv[i];
145
                        } else if (!strcmp(argv[i] + 2, "robx")) {
146
                                i++;
147
                                if (i >= argc) {
148
                                        usage();
149
                                        return 1;
150
                                }
151
                                robx = argv[i];
152
                        } else if (!strncmp(argv[i] + 2, "help", 4)) {
153
                                usage();
154
                                return 0;
155
                        }
156
                } else if (!strcmp(argv[i], "-h")) {
157
                        usage();
158
                        return 0;
159
                } else if (!strcmp(argv[i], "-V")) {
160
                        version();
161
                        return 0;
162
                }
163
        }
164

    
165
        xfilter_init(XF_APP_MODE_STANDALONE);
166

    
167
#ifdef USE_QDBM
168
        if (!strcasecmp(engine, "qdbm"))
169
                xfilter_kvs_qdbm_set_engine();
170
#else
171
        if (0) {}
172
#endif
173
#ifdef USE_SQLITE
174
        else if (!strcasecmp(engine, "sqlite"))
175
                xfilter_kvs_sqlite_set_engine();
176
#else
177
        else if (0) {}
178
#endif
179
#ifdef USE_GDBM
180
        else if (!strcasecmp(engine, "gdbm"))
181
                xfilter_kvs_gdbm_set_engine();
182
#else
183
        else if (0) {}
184
#endif
185
        else {
186
                fprintf(stderr, "Engine '%s' not supported.\n", engine);
187
                xfilter_done();
188
                return 127;
189
        }
190

    
191
        if (verbose)
192
                printf("engine %s has been selected\n", engine);
193

    
194
        /* set global config values */
195
        if (no_bias)
196
                xfilter_set_conf_value("no-bias", "t");
197
        if (method)
198
                xfilter_set_conf_value("method", method);
199
        if (min_dev)
200
                xfilter_set_conf_value("min-dev", min_dev);
201
        if (robs)
202
                xfilter_set_conf_value("robs", robs);
203
        if (robx)
204
                xfilter_set_conf_value("robx", robx);
205

    
206
        if (xfilter_utils_set_base_dir(dbpath) < 0) {
207
                fprintf(stderr, "Could not create base directory.\n");
208
                xfilter_done();
209
                return 127;
210
        }
211

    
212
        dbpath = xfilter_utils_get_base_dir();
213
        if (xfilter_bayes_db_init(dbpath) < 0) {
214
                fprintf(stderr, "Database initialization error.\n");
215
                xfilter_done();
216
                return 127;
217
        }
218

    
219
#define ARGV_OPTION_WITH_PARAM(arg)                                \
220
        (!strcmp(arg, "-E") || !strcmp(arg, "-m") ||                \
221
         !strcmp(arg, "-p") ||                                        \
222
         !strcmp(arg, "--min-dev") || !strcmp(arg, "--robs") ||        \
223
         !strcmp(arg, "--robx"))
224

    
225
        if (mode == MODE_SHOW_STATUS) {
226
                retval = xfilter_bayes_db_show_contents(verbose);
227
        } else if (mode == MODE_LEARN) {
228
                for (i = 1; i < argc; i++) {
229
                        if (ARGV_OPTION_WITH_PARAM(argv[i])) {
230
                                i++;
231
                                if (i >= argc)
232
                                        break;
233
                        } else if (argv[i][0] != '-') {
234
                                retval = learn_filter(learn_mode, argv[i]);
235
                                if (retval != 0)
236
                                        break;
237
                                count++;
238
                        }
239
                }
240
        } else {
241
                for (i = 1; i < argc; i++) {
242
                        if (ARGV_OPTION_WITH_PARAM(argv[i])) {
243
                                i++;
244
                                if (i >= argc)
245
                                        break;
246
                        } else if (argv[i][0] != '-') {
247
                                retval = test_filter(mode, argv[i]);
248
                                if (retval == 127)
249
                                        break;
250
                                count++;
251
                        }
252
                }
253
        }
254

    
255
        xfilter_bayes_db_done();
256
        xfilter_done();
257

    
258
        if (mode != MODE_SHOW_STATUS && count == 0)
259
                fprintf(stderr, "No input file.\n");
260

    
261
        if (verbose)
262
                printf("return value: %d\n", retval);
263

    
264
        return retval;
265
}
266

    
267
static int learn_filter(int mode, const char *file)
268
{
269
        XFilterManager *mgr;
270
        XMessageData *msgdata;
271
        XMessageData *resdata;
272
        XFilterResult *res;
273
        XFilterStatus status;
274
        int retval = 0;
275

    
276
        if (verbose)
277
                printf("learning message file: %s\n", file);
278

    
279
        if ((mode & (MODE_LEARN_JUNK | MODE_LEARN_CLEAN | MODE_UNLEARN_JUNK | MODE_UNLEARN_CLEAN)) == 0) {
280
                fprintf(stderr, "no learn mode specified\n");
281
                return 2;
282
        }
283
        if ((mode & (MODE_LEARN_JUNK | MODE_UNLEARN_CLEAN)) != 0 &&
284
            (mode & (MODE_LEARN_CLEAN | MODE_UNLEARN_JUNK)) != 0) {
285
                fprintf(stderr, "-j/-C and -c/-J cannot be specified at the same time\n");
286
                return 2;
287
        }
288
        if ((mode & (MODE_LEARN_CLEAN | MODE_UNLEARN_JUNK)) != 0 &&
289
            (mode & (MODE_LEARN_JUNK | MODE_UNLEARN_CLEAN)) != 0) {
290
                fprintf(stderr, "-c/-J and -j/-C cannot be specified at the same time\n");
291
                return 2;
292
        }
293

    
294
        mgr = xfilter_manager_new();
295
        xfilter_manager_filter_add(mgr, xfilter_textcontent_new());
296
        xfilter_manager_filter_add(mgr, xfilter_wordsep_new());
297

    
298
        if (mode & MODE_LEARN_JUNK)
299
                xfilter_manager_filter_add(mgr, xfilter_bayes_learn_junk_new());
300
        if (mode & MODE_LEARN_CLEAN)
301
                xfilter_manager_filter_add(mgr, xfilter_bayes_learn_nojunk_new());
302
        if (mode & MODE_UNLEARN_JUNK)
303
                xfilter_manager_filter_add(mgr, xfilter_bayes_unlearn_junk_new());
304
        if (mode & MODE_UNLEARN_CLEAN)
305
                xfilter_manager_filter_add(mgr, xfilter_bayes_unlearn_nojunk_new());
306

    
307
        msgdata = xfilter_message_data_read_file(file, "message/rfc822");
308

    
309
        res = xfilter_manager_run(mgr, msgdata);
310
        if (verbose)
311
                xfilter_result_print(res);
312
        status = xfilter_result_get_status(res);
313
        if (status == XF_UNSUPPORTED_TYPE || status == XF_ERROR) {
314
                fprintf(stderr, "%s: Error on learning mail\n", file);
315
                retval = 127;
316
        }
317

    
318
        if (xfilter_get_debug_mode()) {
319
                resdata = xfilter_result_get_message_data(res);
320
                print_message_data(resdata);
321
        }
322

    
323
        xfilter_result_free(res);
324
        xfilter_message_data_free(msgdata);
325
        xfilter_manager_free(mgr);
326

    
327
        return retval;
328
}
329

    
330
static int test_filter(int mode, const char *file)
331
{
332
        XFilterManager *mgr;
333
        XMessageData *msgdata;
334
        XMessageData *resdata;
335
        XFilterResult *res;
336
        XFilterStatus status;
337
        int retval = 0;
338

    
339
        XFilterConstructorFunc ctors[] = {
340
                xfilter_textcontent_new,
341
                xfilter_blacklist_new,
342
                xfilter_whitelist_new,
343
                xfilter_wordsep_new,
344
                xfilter_bayes_new,
345
                NULL
346
        };
347

    
348
        if (verbose)
349
                printf("testing message file: %s\n", file);
350

    
351
        mgr = xfilter_manager_new();
352
        xfilter_manager_add_filters(mgr, ctors);
353

    
354
        msgdata = xfilter_message_data_read_file(file, "message/rfc822");
355

    
356
        res = xfilter_manager_run(mgr, msgdata);
357
        if (verbose)
358
                xfilter_result_print(res);
359
        status = xfilter_result_get_status(res);
360
        if (status == XF_JUNK) {
361
                printf("%s: This is a junk mail (prob: %f)\n", file, xfilter_result_get_probability(res));
362
                retval = 0;
363
        } else if (status == XF_UNCERTAIN) {
364
                printf("%s: This mail could not be classified (prob: %f)\n", file, xfilter_result_get_probability(res));
365
                retval = 2;
366
        } else if (status == XF_UNSUPPORTED_TYPE || status == XF_ERROR) {
367
                printf("%s: Error on testing mail\n", file);
368
                retval = 127;
369
        } else {
370
                printf("%s: This is a clean mail (prob: %f)\n", file, xfilter_result_get_probability(res));
371
                retval = 1;
372
        }
373

    
374
        if (xfilter_get_debug_mode()) {
375
                resdata = xfilter_result_get_message_data(res);
376
                print_message_data(resdata);
377
        }
378

    
379
        xfilter_result_free(res);
380
        xfilter_message_data_free(msgdata);
381

    
382
        xfilter_manager_free(mgr);
383

    
384
        return retval;
385
}
386

    
387
/*
 * Dump selected attributes of a message to stdout (debug aid).
 *
 * Prints a blank line, then the from/to/cc/subject/received attributes that
 * are present, and -- only when verbose > 2 -- the message content.
 * A NULL msgdata prints nothing.
 */
static void print_message_data(XMessageData *msgdata)
{
        const char *content;

        if (!msgdata)
                return;

        printf("\n");

        content = xfilter_message_data_get_attribute(msgdata, XM_FROM);
        if (content)
                printf("from: %s\n", content);
        content = xfilter_message_data_get_attribute(msgdata, XM_TO);
        if (content)
                printf("to: %s\n", content);
        content = xfilter_message_data_get_attribute(msgdata, XM_CC);
        if (content)
                printf("cc: %s\n", content);
        content = xfilter_message_data_get_attribute(msgdata, XM_SUBJECT);
        if (content)
                printf("subject: %s\n", content);
        content = xfilter_message_data_get_attribute(msgdata, XM_RECEIVED);
        if (content)
                printf("received: %s\n", content);

        if (verbose > 2) {
                content = xfilter_message_data_get_content(msgdata);
                /* Guard like the attributes above: passing a null pointer
                 * to %s is undefined behaviour. */
                if (content)
                        printf("content: %s\n", content);
        }
}
417

    
418
static void version(void)
419
{
420
        printf("SylFilter (tentative name) version " VERSION "\n");
421
}
422

    
423
/*
 * Print the full help text to stdout: version banner, option summary,
 * return values, default database location and the key-value stores this
 * binary was built with.
 */
static void usage(void)
{
        version();

        /* Fixed text: adjacent literals are concatenated by the compiler, so
         * each group is emitted with a single call. */
        fputs("\n"
              "Usage: sylfilter [options] message [message ...]\n"
              "\n"
              "Options:\n"
              "  -j  learn junk (spam) messages\n"
              "  -c  learn clean (non-spam) messages\n"
              "  -J  unlearn junk (spam) messages\n"
              "  -C  unlearn clean (non-spam) messages\n"
              "  -t  classify messages\n"
              "  -v  show verbose messages\n"
              "  -d  show debug messages\n"
              "  -m n|r\n"
              "      specify filtering method\n"
              "      n : Paul Graham (Naive Bayes) method\n"
              "      r : Gary Robinson (Robinson-Fisher) method (default)\n"
              "  --min-dev\n"
              "      ignore if score near (default: 0.1)\n"
              "  --robs\n"
              "      Robinson's s parameter (default: 1.0)\n"
              "  --robx\n"
              "      Robinson's x parameter (default: 0.5)\n"
              "  -B  do not bias probability for clean mail\n"
              "      (Paul/Naive method only, may increase false-positive)\n"
              "\n"
              "  -V  print version\n"
              "  -h, --help\n"
              "      print this help message\n"
              "\n"
              "  -E <engine_name>\n"
              "      specify key-value store engine (show below)\n"
              "  -p <path>\n"
              "      specify database directory\n"
              "\n",
              stdout);

        fputs("Return values:\n"
              "  0   junk (spam)\n"
              "  1   clean (non-spam)\n"
              "  2   uncertain\n"
              "  127 other errors\n"
              "\n",
              stdout);

        printf("Default database location: %s/*.db\n",
               xfilter_utils_get_default_base_dir());

        fputs("\n"
              "Available key-value stores:\n",
              stdout);
#ifdef USE_QDBM
        fputs("  QDBM\n", stdout);
#endif
#ifdef USE_SQLITE
        fputs("  SQLite\n", stdout);
#endif
#ifdef USE_GDBM
        fputs("  GDBM\n", stdout);
#endif
}