#	File: 		furrin.rc
#	Description:	procmail script for foreign character sets
#	Author:		Sean B. Straw
#	Source:		<http://www.professional.org/procmail/furrin.rc>
#	Copyright:	Portions copyright (c) 2000-2003, Sean B. Straw
#	Disclaimer:	<http://www.professional.org/procmail/disclaimer.html>
#	Licensing:	Free for use by the procmail community.
#	Support:	Visit the official procmail discussion list to ask
#			procmail questions.  If you need custom procmail
#			work performed (including modifications to this
#			rcfile), the author is available for paid consulting.
#
#
# This is a procmail recipe file to handle rejecting messages identified as
# employing certain character sets.  Although as used here, it identifies
# messages as "spam" for the author's own purposes, it should NOT be assumed
# that a message in a foreign character encoding is in fact spam.
#
# Users should keep in mind that several character sets are functional
# supersets of the Latin-1 (or similar) character set, and can therefore be
# used to communicate Western European languages in addition to their own
# intended language.
#
# DO NOT AUTO-SUBMIT MESSAGES TO SPAM DATABASES BASED SOLELY UPON THE
# RESULTS OF THIS SCRIPT.
#
# This script may rely upon macros defined outside of this file.
# Additionally, some variables set here are expected to be acted upon by 
# subsequent recipes, rather than dealing with the "spam" right within this
# rcfile.
#
#
# Useful references (no particular order):
# <http://www.iana.org/assignments/character-sets>
# <http://www.unicode.org>
# <http://www.unicode.org/charts/>
# <http://www.iso.ch>
# <http://www.goof.com/pcg/data/marc/iso/locale.txt>
# <http://anubis.dkuug.dk/i18n/charmaps/>
# <http://www.nada.kth.se/i18n/ucs/unicode-iso10646-oview.html>
# <http://www.w3.org/International/O-charset-list.html>
# <http://www.microsoft.com/globaldev/reference/cphome.mspx>
# <http://msdn.microsoft.com/workshop/database/tdc/reference/charset.asp>
# <http://msdn.microsoft.com/workshop/Author/dhtml/reference/charsets/charset4.asp>
# <http://clisp.cons.org/impnotes/encoding.html>
# <http://www.cwi.nl/~aeb/linux/man2html/man7/charsets.7.html>
# <http://www.mozilla.org/quality/intl/chardetect.html>
# <http://www.mozilla.org/docs/l10n/l10nkits/client/windows/docs/nav40/xpencui.htm>
# <http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html>
# <http://www.terena.nl/library/multiling/ml-docs/wincharsets.html>
# <http://www.chilkatsoft.com/ChilkatIConv.asp>
# <http://java.sun.com/j2se/1.4.1/docs/guide/intl/encoding.doc.html>
#
# See also various RFCs, including 1489 and 1557.
#
# ==========================================================================

# Let's start by defining the character sets, grouped by language.
# All of 'em we can lay our hands on, whether you receive them or not.
#
# Obviously, some character sets encompass more than one language set.
# It is advisable to group them according to the more common language,
# favouring the languages which you're likely to RETAIN.
#
# Out of necessity, some character sets (notably, Cyrillics) have been
# grouped by geo-political origin as this author understands them.  I'm not
# a linguist, nor do I have a deep understanding of some of these languages.
# If you have information pertaining to proper reassignment of some of these
# character sets, please contact the author (see website).
#
# Properly, the regexps these are used in are bounded on both sides, so
# "roman" and "romanian" should not collide.
#
# Japanese.  Like every CHARSET_* variable in this file, this is a bare
# alternation with no enclosing parentheses; it is meant to be spliced into a
# larger parenthesized regexp (see CHARSETS and the recipes that use it).
CHARSET_JP="WINDOWS-932|EUC-JP|(cs-?)?ISO-?2022-?JP(-[12])?|ISO-2022-D|SHIFT[-_]JIS|JIS[-_]?X[-_]?02(08|01|12|13)|sjis|jis7|ms-kanji|(x-)?mac(-)?japanese|x-EBCDIC-Japanese(Katakana|AndUSCanada|AndJapaneseLatin|AndKana)"
# Chinese (simplified and traditional).
# The Macintosh alternative accepts "chinesetrad" and "chinesesimp"; the older
# "chineseimp" spelling this file used to list is still accepted via "s?imp".
CHARSET_CN="WINDOWS-(936|950)|EUC-CN|(hz-|x-euc-tw)?GB[-_]?2312|(cn-)?(BIG5|gb)|ISO-2022-([EGHIJKLM]|cn|cn-ext)|ISO-IR-165|GB8565\.2(-1988)?|x-euc-tw|hz|iso-ir-58|gbk|big5-hkscs|gb18030|(x-)?mac(-)?chinese(trad|s?imp)|x-EBCDIC-(Traditional|Simplified)Chinese|x-Chinese-(CNS|eten)"
# Non-standards-compliant spellings of Chinese charset names
CHARSET_CN_BOGUS="CHINESEBIG5|BIG-5"
# Korean
CHARSET_KR="WINDOWS-949|EUC-KR|KS[-_ ]?C[-_ ]?5601([-_ ]?1987)?|ISO-2022-(C|kr)|KS[-_]?X[-_]?1001|ksc5636|iso-646-kr|uhc|johab|(x-)?mac(-)?korean|iso-ir-149|x-EBCDIC-(KoreanAnd)?KoreanExtended"
# Placeholder names rather than real charsets; some mailer actually sets these
CHARSET_BOGUS="X-UNKNOWN|USER-DEFINED"
# Unicode / ISO 10646 encodings.
# Not recommended to block these - they're all rather encompassing.
# (This variable is not part of CHARSETS as shipped.)
# The UNICODE-(16|32) alternative accepts the bare name as well as an
# optional "-LITTLE-ENDIAN" / "-BIG-ENDIAN" suffix, with or without the hyphen
# before LITTLE/BIG.
# NOTE(review): the original alternative had unbalanced parentheses, so the
# intended grouping is an assumption - confirm against the names you see.
CHARSET_UNICODE="UTF(-)?(7|8|16)|UCS(-)?(2|4)|UNICODE-1-1-UTF-7|ISO-10646-UCS-2|UNICODE-(16|32)(-?(LITTLE|BIG)-ENDIAN)?|unicodeFFFE|JAVA|x-EBCDIC-International(-euro)?"
# Plain ASCII.  If you're English, you probably don't want to block this one
# either.  (Not part of CHARSETS as shipped.)
CHARSET_ENG="US-ASCII|ASCII|iso-ir-6|iso646-us|x-EBCDIC-(cp-us|UK)(-euro)?"
# Western European (English, but also French and many others.  Standard.)
# (Not part of CHARSETS as shipped.)
CHARSET_WESTEURO="WINDOWS-1252|ISO-?8859-(1|15)|iso-ir-100|(x-)?mac(-)?roman|latin-?(1|9)|macintosh|x-IA5(-German)?|x-ebcdic-(spain|italy|germany|france)(-euro)?|x-europa"
# Central/Eastern European (non-English)
CHARSET_SLAVIC="WINDOWS-1250|ISO-?8859-(2|16)|iso-ir-(87|102)|(x-)?mac(-)?(central-europe|ce|croatian)|latin-?2|CP870"
# Uncommon stuff and/or generally obsoleted.  Includes Maltese (eh, sorry if
# that's you).
CHARSET_FUNKYLATIN="ISO-?8859-[34]|iso-ir-109|latin-?3"
# Russian, et al.
# KOI8-T is Tajiki (Tajikistan)
# armscii-8 is Armenian
# The Macintosh alternative accepts "ukraine"/"ukrainian" as well as the
# "ukrane"/"ukranian" spellings this file originally listed.
CHARSET_CYRILLIC="WINDOWS-1251|ISO-?8859-5|KOI8(-(RU|[RTU]))?|ISO-IR-(101|111|144|147)|IBM866|(x-)?mac(-)?(romanian?|cyrillic|ukrai?n(e|ian))|nunacom-8|armscii-8|x-EBCDIC-Cyrillic(SerbianBulgarian|Russian)"
# Arabic
CHARSET_ARABIC="WINDOWS-1256|ISO-?8859-6|iso-ir-127|(x-)?mac(-)?arabic|asmo-708|x-EBCDIC-Arabic"
# Greek
CHARSET_GREEK="WINDOWS-1253|ISO-?8859-7|(x-)?mac(-)?greek|iso-ir-(126|150)|x-EBCDIC-Greek(Modern)?"
# Hebrew
CHARSET_HEBREW="WINDOWS-1255|ISO-?8859-8(-i)?|(x-)?mac(-)?hebrew|iso-ir-138|x-EBCDIC-Hebrew"
# Turkish
# NOTE(review): iso-ir-109 is also listed in CHARSET_FUNKYLATIN - confirm
# which group it belongs in.
CHARSET_TURKISH="WINDOWS-1254|ISO-?8859-9|(x-)?mac(-)?turkish|iso-ir-(109|148)|latin-?5|x-EBCDIC-Turkish|CP1026"
# Icelandic/Nordic (i.e. Iceland, Greenland, Norway, Sweden...)
# (Not part of CHARSETS as shipped.)
CHARSET_NORDIC="ISO-?8859-10|(x-)?mac(-)?iceland(ic)?|iso-ir-60|x-IA5-(Norwegian|Swedish)|x-EBCDIC-(FinlandSweden|DenmarkNorway|Icelandic)(-euro)?"
# Thai (ISO not _actually_ used, but draft standard is same)
CHARSET_THAI="WINDOWS-874|TIS[-_]?620|ISO-?8859-11|mulelao-1|ibm-cp1133|(x-)?mac(-)?thai|x-EBCDIC-Thai"
# Vietnamese.
# ISO-8859-12 is bogus (it was suggested for Vietnamese, but can't fit).
# However, I've seen this encoding specified in spam, and lacking an
# official designation, it is filed here.
CHARSET_VIETNAM="WINDOWS-1258|ISO-?8859-12|viscii|tcvn5712|vps"
# Baltic Rim
CHARSET_BALTIC="WINDOWS-1257|ISO-?8859-13|iso-ir-110"
# Celtic (Irish and Welsh).  (Not part of CHARSETS as shipped.)
CHARSET_CELTIC="ISO-?8859-14"
# Other stuff which escapes categorization at this time
CHARSET_MISC="isiri-3342|x-iscii-(as|be|de|gu|ka|ma|or|pa|ta|te)"

# Include desired subsets (which are defined above) here.  This defines
# the language encodings we do not want to receive.
# Make sure OR condition exists only between those which you employ (i.e.
# that there are not EMPTY OR condition sets)

# As provided, this particular set includes all the languages which
# this author does not correspond using.
# DO NOT simply utilize this configuration without first reviewing it.

# Combined alternation of every unwanted charset group.  Groups deliberately
# left out here: CHARSET_UNICODE, CHARSET_ENG, CHARSET_WESTEURO,
# CHARSET_NORDIC and CHARSET_CELTIC.  The value carries no outer parentheses;
# the recipes below wrap it as (${CHARSETS}).
# NOTE(review): the expanded value is long - confirm that the conditions it is
# substituted into still fit within your procmail LINEBUF setting.
CHARSETS="${CHARSET_CN}|${CHARSET_CN_BOGUS}|${CHARSET_KR}|${CHARSET_JP}|${CHARSET_BOGUS}|${CHARSET_SLAVIC}|${CHARSET_FUNKYLATIN}|${CHARSET_CYRILLIC}|${CHARSET_ARABIC}|${CHARSET_GREEK}|${CHARSET_HEBREW}|${CHARSET_TURKISH}|${CHARSET_THAI}|${CHARSET_VIETNAM}|${CHARSET_BALTIC}|${CHARSET_MISC}"

# Ok, that absolute DOOZIE of a regexp is now defined.  Let's go use it...

# ==========================================================================

# Actual recipes using the defined regexp

# Encoded-word check: a From: or Subject: header whose value starts with
# "=?<charset>?Q" or "=?<charset>?B", where <charset> is one of CHARSETS.
#
# The first condition leaves "<charset>?Q" (or "?B") in MATCH.  The second
# condition re-matches MATCH up to the first "?", which trims the delimiter
# so that MATCH holds only the charset name for the note below.
#
# ${wsstar}, ${NL}, SPAMMISHNESS and SPAMNOTES are not defined in this file;
# they are expected to come from the rcfile that includes this one
# (wsstar is presumably "optional whitespace" - verify against the includer).
:0
* $ ^(From|Subject):${wsstar}=\?\/(${CHARSETS})\?[QB]
* MATCH ?? ()\/[^?]+
{
	SPAMVAL="+300"
	SPAMMISHNESS="${SPAMMISHNESS}${SPAMVAL}"
	SPAMNOTES="${SPAMNOTES}SPAM: ${SPAMVAL} Foreign character set encoding (${MATCH}) used in From or Subject.${NL}"
}

# Content-Type check: the message *HEADER* declares charset=<charset>
# (optionally double-quoted), where <charset> is one of CHARSETS.
# To cover MIME part headers in the body as well, add the "HB" flags.
#
# The first condition leaves the charset name plus any trailing delimiter in
# MATCH.  The second condition re-matches MATCH up to the first '?', '"' or
# ';', trimming it to the charset name used in the note below.
#
# ${NL}, SPAMMISHNESS and SPAMNOTES are expected to be supplied by the
# rcfile that includes this one.
:0
* $ ^Content-Type:.*charset=(\")?\/(${CHARSETS})(\")?\>
* MATCH ?? ()\/[^?";]+
{
	SPAMVAL="+300"
	SPAMMISHNESS="${SPAMMISHNESS}${SPAMVAL}"
	SPAMNOTES="${SPAMNOTES}SPAM: ${SPAMVAL} Foreign character set encoding (${MATCH}) in body.${NL}"
}

# Check for raw high-bit (8-bit) characters in the Subject:, From: or To:
# header.  The character class is intended to contain the byte range
# 0x80 - 0xff.
# NOTE(review): the two endpoints of the class must be stored in this file
# as the single bytes 0x80 and 0xFF.  If the file has been re-encoded (for
# example to UTF-8) the class no longer covers that range - verify the raw
# bytes before relying on this recipe.
#
# The outer condition captures the header text up to the last such character
# in MATCH.
:0
* ^(Subject|From|To):\/.*[€-ÿ]
{
	# Weighted scoring: start at -2 and add 1 for every high-bit character
	# found in MATCH.  The recipe only fires when the total is positive,
	# i.e. when MATCH contains at least three such characters.
	:0
	* -2^0
	* 1^1 MATCH ?? [€-ÿ]
	{
		# SPAMTHRESHOLD, SPAMMISHNESS, SPAMNOTES and NL are expected to be
		# supplied by the rcfile that includes this one.
		SPAMVAL="+(${SPAMTHRESHOLD}*2)"
		SPAMMISHNESS="${SPAMMISHNESS}${SPAMVAL}"
		SPAMNOTES="${SPAMNOTES}SPAM: ${SPAMVAL} raw 8-bit characters in the Subject/From/To${NL}"
	}
}

# ==========================================================================

# The module which includes this one should take action based on variables
# which are set in the recipes above.