/* SylFilter - a message filter
 *
 * Copyright (C) 2011 Hiroyuki Yamamoto
 * Copyright (C) 2011 Sylpheed Development Team
 */

#include <glib.h>
#include <string.h>

#include "filter.h"
#include "wordsep-filter.h"

/*
 * Script class assigned to a character while splitting text into words.
 * Only wide characters are ever given a class other than UC_OTHER.
 */
typedef enum
{
	UC_OTHER = 0,	/* anything not matched below */
	UC_HIRA  = 1,	/* matched by u_is_hira() */
	UC_KATA  = 2,	/* matched by u_is_kata() */
	UC_HAN   = 3	/* matched by u_is_han() */
} UCType;

/* Words longer than this many characters are dropped from the output. */
#define MAX_TOKEN_LEN	42
#define NGRAM_LEN	4	/* 4-gram */

/*
 * Inclusive range test: non-zero when start <= c <= end.
 * Every argument is parenthesized so that expression arguments (for
 * example a conditional) expand as intended.  Note that c is evaluated
 * twice, so it must not have side effects.
 */
#define URANGE(c, start, end) ((c) >= (start) && (c) <= (end))

/* Hiragana letters and hiragana iteration marks. */
#define u_is_hira(c)	(URANGE(c, 0x3041, 0x3096) ||	\
			 URANGE(c, 0x309d, 0x309f))
/* Katakana, including the phonetic extensions and half-width forms. */
#define u_is_kata(c)	(URANGE(c, 0x30a1, 0x30fa) ||	\
			 URANGE(c, 0x30fd, 0x30ff) ||	\
			 URANGE(c, 0x31f0, 0x31ff) ||	\
			 URANGE(c, 0xff66, 0xff6f) ||	\
			 URANGE(c, 0xff71, 0xff9d))
/* Han ideographs, radicals and a few ideographic marks/numerals. */
#define u_is_han(c)	(URANGE(c, 0x2e80, 0x2fdf) ||	\
			 (c) == 0x3005 ||		\
			 (c) == 0x3007 ||		\
			 URANGE(c, 0x3021, 0x3029) ||	\
			 URANGE(c, 0x3038, 0x303b) ||	\
			 URANGE(c, 0x3400, 0x4dbf) ||	\
			 URANGE(c, 0x4e00, 0x9fff) ||	\
			 URANGE(c, 0xf900, 0xfaff) ||	\
			 URANGE(c, 0x20000, 0x2ffff))

/*
 * Append word w to GString s as a new token: a single space is inserted
 * first when s is not empty, and the token is prefixed with "Url*" when
 * is_url is true.
 *
 * The macro reads a variable named is_url from the scope where it is
 * expanded.  It is wrapped in do { } while (0) so that it behaves as one
 * statement in any if/else position; s is evaluated more than once.
 */
#define APPEND_STR(s, w)				\
do {							\
	if ((s)->len > 0)				\
		g_string_append_c((s), ' ');		\
	if (is_url)					\
		g_string_append((s), "Url*");		\
	g_string_append((s), (w));			\
} while (0)


/*
 * Append the character n-grams of word to str.
 *
 * A window of NGRAM_LEN characters is slid over word one character at a
 * time; each full window is appended to str as its own token, preceded by
 * a space when str is not empty and by "Url*" when is_url is true.  A word
 * with fewer than NGRAM_LEN characters appends nothing.
 *
 * str:    output buffer, modified in place.
 * word:   NUL-terminated string to split.
 *         NOTE(review): characters are stepped with g_utf8_next_char(), so
 *         word is presumably expected to be valid UTF-8 -- confirm upstream.
 * is_url: whether each emitted token gets the "Url*" prefix.
 */
static void append_ngram_str(GString *str, const char *word, gboolean is_url)
{
	const char *p = word;
	const char *bp;
	int len;

	xfilter_debug_print("append_ngram_str: %s\n", word);

	while (*p != '\0') {
		/* Advance p up to NGRAM_LEN characters past the window start. */
		len = 0;
		bp = p;
		while (len < NGRAM_LEN) {
			p = g_utf8_next_char(p);
			len++;
			if (*p == '\0')
				break;
		}
		/* Fewer than NGRAM_LEN characters remain: no more n-grams. */
		if (len < NGRAM_LEN)
			break;

		if (str->len > 0)
			g_string_append_c(str, ' ');
		if (is_url)
			g_string_append(str, "Url*");
		g_string_append_len(str, bp, p - bp);
		/* The '*' precision of a printf-style format must be an int,
		 * not the ptrdiff_t produced by pointer subtraction. */
		xfilter_debug_print("n-gram: %.*s\n", (int)(p - bp), bp);

		/* Slide the window forward by one character. */
		p = g_utf8_next_char(bp);
	}
}

/*
 * Split content into words and return them joined by single spaces.
 *
 * The text is scanned one UTF-8 character at a time.  A word boundary is
 * placed wherever the character class changes (wide/narrow, punctuation,
 * whitespace, Han/hiragana/katakana/other); see the do_break chain below
 * for the exact rules.  Whitespace itself is never emitted.
 *
 * content:            NUL-terminated input text; NULL yields an empty
 *                     result.
 * drop_punct:         drop a word whose last character is punctuation.
 * drop_single_letter: drop one-character words unless the character is Han.
 * drop_two_hira:      drop two-character words made of two hiragana.
 * normalize:          lower-case ASCII letters of each emitted word.
 * mb_ngram:           emit Han words longer than NGRAM_LEN characters as
 *                     n-grams (via append_ngram_str) instead of whole.
 * is_url:             prefix every emitted token with "Url*"; also turns
 *                     off URL detection (used for the recursive call).
 * is_header:          turns off URL detection.
 *
 * Words longer than MAX_TOKEN_LEN characters are always dropped.
 *
 * Returns a newly allocated string that the caller must g_free().
 */
static char *do_wordsep(const char *content, gboolean drop_punct,
			gboolean drop_single_letter, gboolean drop_two_hira,
			gboolean normalize, gboolean mb_ngram,
			gboolean is_url, gboolean is_header)
{
	GString *out_str;
	const char *p = content;
	const char *bp = content;	/* start of the current word */
	UCType prev_uctype = UC_OTHER;
	UCType prev2_uctype = UC_OTHER;
	GUnicodeType prev_utype = G_UNICODE_OTHER_LETTER;
	gboolean prev_is_wide = FALSE;
	gboolean prev_is_punct = FALSE;
	gboolean prev_is_digit = FALSE;
	gint word_len = 0;		/* characters in the current word */

	/* Nothing to split: hand back an empty, freeable string. */
	if (content == NULL)
		return g_strdup("");

	out_str = g_string_new("");

	while (*p != '\0') {
		gunichar wc;
		UCType uctype = UC_OTHER;
		GUnicodeType utype = G_UNICODE_OTHER_LETTER;
		gboolean is_wide, is_space, is_punct, is_digit;
		gboolean do_break = FALSE;
		gboolean is_drop = FALSE;
		gboolean do_ngram = FALSE;

		/*
		 * At the start of a word in body text, recognize
		 * http:// and https:// URLs.  The scheme and an optional
		 * "www." are skipped, and the rest of the URL is split
		 * recursively with is_url set so its tokens are tagged.
		 */
		if (!is_header && !is_url && p == bp && *p == 'h') {
			if (!strncmp(p, "http://", 7))
				p += 7;
			else if (!strncmp(p, "https://", 8))
				p += 8;
			if (p > bp) {
				if (!strncmp(p, "www.", 4))
					p += 4;
			}

			if (p > bp) {
				const char *ep = p;
				char *url_str, *sep_str;

				/* The URL runs to the next non-graphic ASCII byte. */
				while (*ep != '\0' && g_ascii_isgraph(*ep))
					ep++;
				url_str = g_strndup(p, ep - p);
				sep_str = do_wordsep(url_str, drop_punct, drop_single_letter, drop_two_hira, normalize, mb_ngram, TRUE, is_header);
				if (out_str->len > 0)
					g_string_append_c(out_str, ' ');
				g_string_append(out_str, sep_str);
				/* g_string_append() copied the text, so the
				 * recursive result is ours to release. */
				g_free(sep_str);
				g_free(url_str);
				word_len = 0;
				bp = p = ep;
				continue;
			}
		}

		wc = g_utf8_get_char(p);
		utype = g_unichar_type(wc);
		is_wide = g_unichar_iswide(wc) || wc == 0x25cb ||
			URANGE(wc, 0x25ce, 0x25d1);
		is_space = g_unichar_isspace(wc);
		/* '!', '$', '\'' and '-' are kept as word characters. */
		is_punct = (*p != '!' && *p != '$' && *p != '\'' && *p != '-')
			&& g_unichar_ispunct(wc);
		is_digit = g_unichar_isdigit(wc);
		if (is_wide) {
			if (u_is_hira(wc))
				uctype = UC_HIRA;
			else if (u_is_kata(wc))
				uctype = UC_KATA;
			else if (u_is_han(wc))
				uctype = UC_HAN;
		}
#if 0
		if (utype == G_UNICODE_DASH_PUNCTUATION ||
		    utype == G_UNICODE_OTHER_SYMBOL)
			is_punct = FALSE;
#endif
		/* handle 1,234,567 / 192.168.0.1 */
		if (*p == '.' || *p == ',') {
			if (prev_is_digit) {
				const char *np;
				gunichar nwc;
				np = g_utf8_next_char(p);
				nwc = g_utf8_get_char(np);
				if (g_unichar_isdigit(nwc))
					is_punct = FALSE;
			}
		}

#if 0
		{
			gchar s[7] = {0};
			g_unichar_to_utf8(wc, s);
			xfilter_debug_print("%s: utype=%d word_len=%d\n", s, utype, word_len);
		}
#endif

		/* Decide whether the current character starts a new word. */
		if (p > bp) {
			if (prev_is_wide != is_wide)
				do_break = TRUE;
			else if (prev_is_punct != is_punct)
				do_break = TRUE;
			else if (is_space || is_punct)
				do_break = TRUE;
			else if (prev_uctype != UC_HAN && uctype == UC_HAN)
				do_break = TRUE;
			else if (prev_uctype == UC_HAN && uctype != UC_HAN)
				do_break = TRUE;
			else if (prev_uctype == UC_KATA &&
				 uctype != UC_KATA &&
				 utype != G_UNICODE_MODIFIER_LETTER &&
				 utype != G_UNICODE_OTHER_SYMBOL)
				do_break = TRUE;
			else if (prev_uctype == UC_HIRA &&
				 uctype != UC_HIRA &&
				 utype != G_UNICODE_MODIFIER_LETTER &&
				 utype != G_UNICODE_OTHER_SYMBOL)
				do_break = TRUE;
			else if (prev_uctype == UC_OTHER &&
				 uctype != UC_OTHER &&
				 prev_utype != G_UNICODE_MODIFIER_LETTER)
				do_break = TRUE;
			else if (prev_utype == G_UNICODE_MODIFIER_LETTER &&
				 prev2_uctype != uctype)
				do_break = TRUE;
		}

		/* Decide whether the word that just ended is discarded. */
		if (drop_punct) {
			is_drop = prev_is_punct;
		}
		if (drop_single_letter) {
			if (word_len == 1 && do_break && prev_uctype != UC_HAN)
				is_drop = TRUE;
		}
		if (drop_two_hira) {
			if (word_len == 2 && do_break && prev2_uctype == UC_HIRA && prev_uctype == UC_HIRA)
				is_drop = TRUE;
		}

		if (word_len > MAX_TOKEN_LEN && do_break)
			is_drop = TRUE;

		if (mb_ngram && !is_drop && do_break &&
		    prev_uctype == UC_HAN && word_len > NGRAM_LEN)
			do_ngram = TRUE;

		/* Remember this character's classes for the next iteration. */
		prev2_uctype = prev_uctype;
		prev_uctype = uctype;
		prev_utype = utype;
		prev_is_wide = is_wide;
		prev_is_punct = is_punct;
		prev_is_digit = is_digit;

		if (do_break) {
			gchar *word;

			word_len = 0;

			/* Emit the word spanning [bp, p). */
			if (!is_drop) {
				if (normalize)
					word = g_ascii_strdown(bp, p - bp);
				else
					word = g_strndup(bp, p - bp);
				if (do_ngram)
					append_ngram_str(out_str, word, is_url);
				else
					APPEND_STR(out_str, word);
				g_free(word);
			}

			if (is_space) {
				/* Skip the whole whitespace run. */
				do {
					p = g_utf8_next_char(p);
					wc = g_utf8_get_char(p);
				} while (g_unichar_isspace(wc));
				bp = p;
				continue;
			}
			bp = p;
		} else {
			if (is_space) {
				/* Whitespace at the start of a word: skip it. */
				word_len = 0;
				do {
					p = g_utf8_next_char(p);
					wc = g_utf8_get_char(p);
				} while (g_unichar_isspace(wc));
				bp = p;
				continue;
			}
		}

		p = g_utf8_next_char(p);
		word_len++;
	}

	/* Flush the final word, applying the same drop/n-gram rules. */
	if (p > bp) {
		gchar *word;
		gboolean is_drop = FALSE;
		gboolean do_ngram = FALSE;

		if (drop_punct) {
			is_drop = prev_is_punct;
		}
		if (drop_single_letter) {
			if (word_len == 1 && prev_uctype != UC_HAN)
				is_drop = TRUE;
		}
		if (drop_two_hira) {
			if (word_len == 2 && prev2_uctype == UC_HIRA && prev_uctype == UC_HIRA)
				is_drop = TRUE;
		}

		if (word_len > MAX_TOKEN_LEN)
			is_drop = TRUE;

		if (mb_ngram && prev_uctype == UC_HAN && word_len > NGRAM_LEN)
			do_ngram = TRUE;

		if (!is_drop) {
			if (normalize)
				word = g_ascii_strdown(bp, -1);
			else
				word = g_strdup(bp);
			if (do_ngram)
				append_ngram_str(out_str, word, is_url);
			else
				APPEND_STR(out_str, word);
			g_free(word);
		}
	}

	/* Keep the character data; only the GString wrapper is released. */
	return g_string_free(out_str, FALSE);
}

/*
 * Content filter callback: word-separate a text message.
 *
 * For data whose MIME type starts with "text/" (ASCII case-insensitive),
 * a new message data object is created holding the word-separated body,
 * plus word-separated copies of the From, To, Cc, Subject and Received
 * attributes that are present.  It is stored in result and the status is
 * set to XF_REWRITTEN.
 *
 * filter: unused here.
 * data:   input message data.
 * result: receives the new message data and status; must not be NULL.
 *
 * Returns XF_REWRITTEN on success, XF_UNSUPPORTED_TYPE when the MIME type
 * is missing or not text, and XF_ERROR when result is NULL.
 */
static XFilterStatus xfilter_wordsep_func(XFilter *filter, const XMessageData *data, XFilterResult *result)
{
	const char *mime_type;
	const char *content = NULL;
	char *processed_content;
	XMessageData *msgdata;

	g_return_val_if_fail(result != NULL, XF_ERROR);

	mime_type = xfilter_message_data_get_mime_type(data);
	if (!mime_type) {
		xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
		return XF_UNSUPPORTED_TYPE;
	}

	/* MIME type names are ASCII tokens, so compare them independently of
	 * the current locale (g_strncasecmp is deprecated for that reason). */
	if (g_ascii_strncasecmp(mime_type, "text/", 5) != 0) {
		xfilter_result_set_status(result, XF_UNSUPPORTED_TYPE);
		return XF_UNSUPPORTED_TYPE;
	}

	content = xfilter_message_data_get_content(data);
	processed_content = do_wordsep(content, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE);

	msgdata = xfilter_message_data_new(NULL, mime_type);
	/* NOTE(review): processed_content is not freed here, unlike in the
	 * header case below; presumably xfilter_message_data_set_content()
	 * takes ownership of the buffer -- confirm against filter.h. */
	xfilter_message_data_set_content(msgdata, processed_content);

/* Word-separate one header attribute, if present, and copy it to msgdata. */
#define WORDSEP_HEADER(attr)						\
do {									\
	content = xfilter_message_data_get_attribute(data, attr);	\
	if (content) {							\
		xfilter_debug_print("header: %s\n", content);		\
		processed_content =					\
			do_wordsep(content, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE); \
		xfilter_message_data_set_attribute			\
			(msgdata, attr, processed_content, FALSE);	\
		g_free(processed_content);				\
	}								\
} while (0)

	WORDSEP_HEADER(XM_FROM);
	WORDSEP_HEADER(XM_TO);
	WORDSEP_HEADER(XM_CC);
	WORDSEP_HEADER(XM_SUBJECT);
	WORDSEP_HEADER(XM_RECEIVED);

#undef WORDSEP_HEADER

	xfilter_result_set_message_data(result, msgdata);

	xfilter_result_set_status(result, XF_REWRITTEN);

	return XF_REWRITTEN;
}

/*
 * Create the "wordsep" content filter, with xfilter_wordsep_func installed
 * as its content filter function, and return it.
 */
XFilter *xfilter_wordsep_new(void)
{
	XFilter *wordsep = xfilter_new(XF_CONTENT, "wordsep");

	xfilter_set_content_filter_func(X_CONTENT_FILTER(wordsep),
					xfilter_wordsep_func);
	return wordsep;
}
