proton-bridge/pkg/mime/encoding_test.go

458 lines
16 KiB
Go
Raw Normal View History

2023-01-02 10:02:26 +00:00
// Copyright (c) 2023 Proton AG
//
// This file is part of Proton Mail Bridge.
//
// Proton Mail Bridge is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proton Mail Bridge is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Proton Mail Bridge. If not, see <https://www.gnu.org/licenses/>.
package pmmime
import (
"bytes"
"strings"
"testing"
"golang.org/x/text/encoding/htmlindex"
a "github.com/stretchr/testify/assert"
)
// TestDecodeHeader feeds RFC 2047 style encoded-word headers to DecodeHeader
// and compares the decoded text with the expected string. Only the decoded
// value is checked; the returned error is reported for diagnostics.
func TestDecodeHeader(t *testing.T) {
	cases := []struct{ raw, expected string }{
		// Empty input stays empty.
		{raw: "", expected: ""},
		// ISO-2022-JP with quoted-printable escape sequences.
		{
			raw:      "=?iso-2022-jp?Q?=1B$B!Z=1B(BTimes_Car_PLUS=1B$B![JV5Q>Z=1B(B?=",
			expected: "【Times Car PLUS】返却証",
		},
		{
			raw:      `=?iso-2022-jp?Q?iTunes_Movie_=1B$B%K%e!<%j%j!<%9$HCmL\:nIJ=1B(B?=`,
			expected: "iTunes Movie ニューリリースと注目作品",
		},
		// Two adjacent base64 encoded words are joined into one string.
		{
			raw:      "=?UTF-8?B?w4TDi8OPw5bDnA==?= =?UTF-8?B?IMOkw6vDr8O2w7w=?=",
			expected: "ÄËÏÖÜ äëïöü",
		},
		{
			raw:      "=?ISO-8859-2?B?xMtJ1tw=?= =?ISO-8859-2?B?IOTrafb8?=",
			expected: "ÄËIÖÜ äëiöü",
		},
		// An unrecognised charset name: the header is expected back verbatim.
		{
			raw:      "=?uknown?B?xMtJ1tw=?= =?ISO-8859-2?B?IOTrafb8?=",
			expected: "=?uknown?B?xMtJ1tw=?= =?ISO-8859-2?B?IOTrafb8?=",
		},
	}

	for _, tc := range cases {
		got, err := DecodeHeader(tc.raw)
		if strings.Compare(tc.expected, got) == 0 {
			continue
		}
		t.Errorf("Incorrect decoding of header %q expected %q but have %q; Error %v", tc.raw, tc.expected, got, err)
	}
}
// testParseMediaTypeData is one ParseMediaType test case: the raw header
// value to parse and the results the parser is expected to produce.
type testParseMediaTypeData struct {
	// arg is the raw header value handed to ParseMediaType.
	arg string
	// wantMediaType is the expected media type.
	wantMediaType string
	// wantParams is the expected set of decoded parameters.
	wantParams map[string]string
}
// run parses d.arg with ParseMediaType and asserts that parsing succeeds and
// that the media type and parameters match the expectations stored in d.
func (d *testParseMediaTypeData) run(t *testing.T) {
	gotMediaType, gotParams, err := ParseMediaType(d.arg)

	// NoError prints the error text on failure. The results are not
	// meaningful after a parse error, so skip the remaining assertions.
	if !a.NoError(t, err) {
		return
	}

	a.Equal(t, d.wantMediaType, gotMediaType)
	a.Equal(t, d.wantParams, gotParams)
}
func TestParseMediaType(t *testing.T) {
testTable := map[string]testParseMediaTypeData{
"TwiceTheSameParameter": {
arg: "attachment; filename=joy.txt; filename=JOY.TXT; title=hi;",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "JOY.TXT", "title": "hi"},
},
"SingleLineUTF8": {
arg: "attachment;\nfilename*=utf-8''%F0%9F%98%81%F0%9F%98%82.txt;\n title=smile",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "😁😂.txt", "title": "smile"},
},
"MultiLineUTF8": {
arg: "attachment;\nfilename*0*=utf-8''%F0%9F%98%81; title=smile;\nfilename*1*=%F0%9F%98%82;\nfilename*2=.txt",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "😁😂.txt", "title": "smile"},
},
"MultiLineFirstNoEncNextUTF8": {
arg: "attachment;\nfilename*0*=utf-8''joy ;\n title*=utf-8''smile; \nfilename*1*=%F0%9F%98%82;\nfilename*2=.txt",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "joy😂.txt", "title": "smile"},
},
"SingleLineBig5": {
arg: "attachment;\nfilename*=big5''%B3%C6%A7%D1%BF%FD.m4a; title*=utf8''memorandum",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "備忘錄.m4a", "title": "memorandum"},
},
"MultiLineBig5": {
arg: "attachment;\nfilename*0*=big5''%B3%C6a; title*0=utf8''memorandum; filename*2=%BF%FD.m4a; \nfilename*1*=%A7%D1b;",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "備a忘b錄.m4a", "title": "memorandum"},
},
2020-06-30 14:33:29 +00:00
"SingleLineBadEncoding": {
arg: "attachment;\nfilename*=utf-8'%F0%9F%98%81%F0%9F%98%82.txt;\n title=smile",
wantMediaType: "attachment",
wantParams: map[string]string{"title": "smile"},
},
2020-06-30 14:33:29 +00:00
"MultiLineBadEncoding": {
arg: "attachment;\nfilename*0*=utf-8'%F0%9F%98%81; title=smile;\nfilename*1*=%F0%9F%98%82;\nfilename*2=.txt",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "😂.txt", "title": "smile"},
},
}
for name, testData := range testTable {
t.Run(name, testData.run)
}
}
func TestGetEncoding(t *testing.T) {
// All MIME charsets with aliases can be found here:
// https://www.iana.org/assignments/character-sets/character-sets.xhtml
mimesets := map[string][]string{
2022-05-31 13:54:04 +00:00
"utf-8": { // MIB 16
"utf8",
"csutf8",
"unicode-1-1-utf-8",
"iso-utf-8",
"utf8mb4",
},
2022-05-31 13:54:04 +00:00
"gbk": {
"gb2312", // MIB 2025
//"euc-cn": []string{
"euccn",
"ibm-euccn",
},
//"utf7": []string{"utf-7", "unicode-1-1-utf-7"},
2022-05-31 13:54:04 +00:00
"iso-8859-2": { // MIB 5
"iso-ir-101",
"iso_8859-2",
"iso8859-2",
"latin2",
"l2",
"csisolatin2",
"ibm852",
//"FAILEDibm852",
},
2022-05-31 13:54:04 +00:00
"iso-8859-3": { // MIB 6
"iso-ir-109",
"iso_8859-3",
"latin3",
"l3",
"csisolatin3",
},
2022-05-31 13:54:04 +00:00
"iso-8859-4": { // MIB 7
"iso-ir-110",
"iso_8859-4",
"latin4",
"l4",
"csisolatin4",
},
2022-05-31 13:54:04 +00:00
"iso-8859-5": { // MIB 8
"iso-ir-144",
"iso_8859-5",
"cyrillic",
"csisolatincyrillic",
},
2022-05-31 13:54:04 +00:00
"iso-8859-6": { // MIB 9
"iso-ir-127",
"iso_8859-6",
"ecma-114",
"asmo-708",
"arabic",
"csisolatinarabic",
//"iso-8859-6e": []string{ // MIB 81 just direction
"csiso88596e",
"iso-8859-6-e",
//"iso-8859-6i": []string{ // MIB 82
"csiso88596i",
2022-05-31 13:54:04 +00:00
"iso-8859-6-i",
},
"iso-8859-7": { // MIB 10
"iso-ir-126",
"iso_8859-7",
"elot_928",
"ecma-118",
"greek",
"greek8",
2022-05-31 13:54:04 +00:00
"csisolatingreek",
},
"iso-8859-8": { // MIB 11
"iso-ir-138",
"iso_8859-8",
"hebrew",
"csisolatinhebrew",
//"iso-8859-8e": []string{ // MIB 84 (directionality
"csiso88598e",
"iso-8859-8-e",
},
2022-05-31 13:54:04 +00:00
"iso-8859-8-i": { // MIB 85
"logical",
"csiso88598i",
"iso-8859-8-i", // Hebrew, the "i" means right-to-left, probably unnecessary with ISO cleaning above.
},
2022-05-31 13:54:04 +00:00
"iso-8859-10": { // MIB 13
"iso-ir-157",
"l6",
"iso_8859-10:1992",
"csisolatin6",
2022-05-31 13:54:04 +00:00
"latin6",
},
"iso-8859-13": { // MIB 109
"csiso885913"},
2022-05-31 13:54:04 +00:00
"iso-8859-14": { // MIB 110
"iso-ir-199",
"iso_8859-14:1998",
"iso_8859-14",
"latin8",
"iso-celtic",
"l8",
2022-05-31 13:54:04 +00:00
"csiso885914",
},
"iso-8859-15": { // MIB 111
"iso_8859-15",
"latin-9",
"csiso885915",
2022-05-31 13:54:04 +00:00
"ISO8859-15",
},
"iso-8859-16": { // MIB 112
"iso-ir-226",
"iso_8859-16:2001",
"iso_8859-16",
"latin10",
"l10",
"csiso885916",
},
2022-05-31 13:54:04 +00:00
"windows-874": { // MIB 2109
"cswindows874",
"cp874",
"iso-8859-11",
"tis-620",
},
2022-05-31 13:54:04 +00:00
"windows-1250": { // MIB 2250
"cswindows1250",
"cp1250",
},
2022-05-31 13:54:04 +00:00
"windows-1251": { // MIB 2251
"cswindows1251",
"cp1251",
},
2022-05-31 13:54:04 +00:00
"windows-1252": { // MIB 2252
"cswindows1252",
"cp1252",
"3dwindows-1252",
"we8mswin1252",
"us-ascii", // MIB 3
"ansi_x3.110-1983", // MIB 74 // usascii
//"iso-8859-1": []string{ // MIB 4 succeed by win1252
"iso8859-1",
"iso-ir-100",
"iso_8859-1",
"latin1",
"l1",
"ibm819",
"cp819",
"csisolatin1",
"ansi_x3.4-1968",
"ansi_x3.4-1986",
"cp850",
"cp858", // "cp850" Mostly correct except for the Euro sign.
"iso_646.irv:1991",
"iso646-us",
"us",
"ibm367",
"cp367",
"csascii",
"ascii",
"iso-ir-6",
"we8iso8859p1",
},
2022-05-31 13:54:04 +00:00
"windows-1253": {"cswindows1253", "cp1253"}, // MIB 2253
"windows-1254": {"cswindows1254", "cp1254"}, // MIB 2254
"windows-1255": {"cSwindows1255", "cp1255"}, // MIB 2255
"windows-1256": {"cswIndows1256", "cp1256"}, // MIB 2256
"windows-1257": {"cswinDows1257", "cp1257"}, // MIB 2257
"windows-1258": {"cswindoWs1258", "cp1258"}, // MIB 2257
"koi8-r": {"cskoi8r", "koi8r"}, // MIB 2084
"koi8-u": {"cskoi8u", "koi8u"}, // MIB 2088
"macintosh": {"mac", "macroman", "csmacintosh"}, // MIB 2027
"big5": {
"zht16mswin950", // cp950
"cp950",
},
2022-05-31 13:54:04 +00:00
"euc-kr": {
"euckr", // MIB 38
"ibm-euckr",
//"uhc": []string{ // Korea
"ks_c_5601-1987",
"ksc5601",
"cp949",
},
2022-05-31 13:54:04 +00:00
"euc-jp": {
"eucjp",
"ibm-eucjp",
},
2022-05-31 13:54:04 +00:00
"shift_jis": {
"CP932",
"MS932",
"Windows-932",
"Windows-31J",
"MS_Kanji",
"IBM-943",
"CP943",
},
2022-05-31 13:54:04 +00:00
"iso-2022-jp": { // MIB 39
"iso2022jp",
"csiso2022jp",
},
}
for expected, names := range mimesets {
expenc, _ := htmlindex.Get(expected)
if canonical, err := htmlindex.Name(expenc); canonical != expected || err != nil {
t.Fatalf("Error while get canonical name. Expected '%v' but have %v `%#v`: %v", expected, canonical, expenc, err)
}
for _, name := range names {
enc, err := getEncoding(name)
if err != nil || enc == nil {
t.Errorf("Error while getting encoding for %v returned: '%#v' and error: '%v'", name, enc, err)
}
if expenc != enc {
t.Errorf("For %v expected %v '%v' but have '%v'", name, expected, expenc, enc)
}
}
}
}
// sample text for UTF8 http://www.columbia.edu/~fdc/utf8/index.html
func TestEncodeReader(t *testing.T) {
// define test data
testData := []struct {
charset string
original []byte
message string
}{
// russian
{
"koi8-r",
// а, з, б, у, к, а, а, б, в, г, д, е, ё
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
"азбукаабвгдеё",
},
{
"KOI8-R",
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
"азбукаабвгдеё",
},
{
"csKOI8R",
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
"азбукаабвгдеё",
},
{
"koi8-u",
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
"азбукаабвгдеё",
},
{
"iso-8859-5",
// а , з , б , у , к , а , а , б , в , г , д , е , ё
[]byte{0xD0, 0xD7, 0xD1, 0xE3, 0xDA, 0xD0, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xF1},
"азбукаабвгдеё",
},
{
"csWrong",
[]byte{0xD0, 0xD7, 0xD1, 0xE3, 0xDA, 0xD0, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6},
"",
},
{
"utf8",
[]byte{0xD0, 0xB0, 0xD0, 0xB7, 0xD0, 0xB1, 0xD1, 0x83, 0xD0, 0xBA, 0xD0, 0xB0, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD1, 0x91},
"азбукаабвгдеё",
},
// czechoslovakia
{
"windows-1250",
[]byte{225, 228, 232, 233, 236, 244},
"áäčéěô",
},
// umlauts
{
"iso-8859-1",
[]byte{196, 203, 214, 220, 228, 235, 246, 252},
"ÄËÖÜäëöü",
},
// latvia
{
"iso-8859-4",
[]byte{224, 239, 243, 182, 254},
"āīķļū",
},
{ // encoded by https://www.motobit.com/util/charset-codepage-conversion.asp
"utf7",
[]byte("He wes Leovena+APA-es sone -- li+APA-e him be Drihten.+A6QDtw- +A7MDuwPOA8MDwwOx- +A7wDvwPF- +A60DtAPJA8MDsQO9- +A7UDuwO7A7cDvQO5A7oDrg-. +BCcENQRABD0ENQQ7BDg- +BDgENwQxBEs- +BDcENAQ1BEEETA- +BDg- +BEIEMAQ8-,+BCcENQRABD0ENQQ7BDg- +BDgENwQxBEs- +BDcENAQ1BEEETA- +BDg- +BEIEMAQ8-,+C68LvguuC7ELvwuoC80LpA- +C64Lygu0C78LlQuzC78LsgvH- +C6QLrgu/C7QLzQuuC8oLtAu/- +C6oLywuyC80- +C4cLqQu/C6QLvgu1C6QLwQ- +C44LmQvNC5ULwQuuC80- +C5ULvgujC8sLrgvN-."),
"He wes Leovenaðes sone -- liðe him be Drihten.Τη γλώσσα μου έδωσαν ελληνική. Чернели избы здесь и там,Чернели избы здесь и там,யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம்.",
},
// iconv -f UTF8 -t GB2312 utf8.txt | hexdump -v -e '"0x" 1/1 "%x, "'
{ // encoded by iconv; dump by `cat gb2312.txt | hexdump -v -e '"0x" 1/1 "%x "'` and reformat; text from https://zh.wikipedia.org/wiki/GB_2312
"GB2312",
[]byte{0x47, 0x42, 0x20, 0x32, 0x33, 0x31, 0x32, 0xb5, 0xc4, 0xb3, 0xf6, 0xcf, 0xd6, 0xa3, 0xac, 0xbb, 0xf9, 0xb1, 0xbe, 0xc2, 0xfa, 0xd7, 0xe3, 0xc1, 0xcb, 0xba, 0xba, 0xd7, 0xd6, 0xb5, 0xc4, 0xbc, 0xc6, 0xcb, 0xe3, 0xbb, 0xfa, 0xb4, 0xa6, 0xc0, 0xed, 0xd0, 0xe8, 0xd2, 0xaa, 0xa3, 0xac, 0xcb, 0xfc, 0xcb, 0xf9, 0xca, 0xd5, 0xc2, 0xbc, 0xb5, 0xc4, 0xba, 0xba, 0xd7, 0xd6, 0xd2, 0xd1, 0xbe, 0xad, 0xb8, 0xb2, 0xb8, 0xc7, 0xd6, 0xd0, 0xb9, 0xfa, 0xb4, 0xf3, 0xc2, 0xbd, 0x39, 0x39, 0x2e, 0x37, 0x35, 0x25, 0xb5, 0xc4, 0xca, 0xb9, 0xd3, 0xc3, 0xc6, 0xb5, 0xc2, 0xca, 0xa1, 0xa3, 0xb5, 0xab, 0xb6, 0xd4, 0xd3, 0xda, 0xc8, 0xcb, 0xc3, 0xfb},
"GB 2312的出现基本满足了汉字的计算机处理需要它所收录的汉字已经覆盖中国大陆99.75%的使用频率。但对于人名",
},
{ // encoded by iconv; text from https://jp.wikipedia.org/wiki/Shift_JIS
"shift-jis",
[]byte{0x95, 0xb6, 0x8e, 0x9a, 0x95, 0x84, 0x8d, 0x86, 0x89, 0xbb, 0x95, 0xfb, 0x8e, 0xae, 0x53, 0x68, 0x69, 0x66, 0x74, 0x5f, 0x4a, 0x49, 0x53, 0x82, 0xcc, 0x90, 0xdd, 0x8c, 0x76, 0x8e, 0xd2, 0x82, 0xe7, 0x82, 0xcd, 0x81, 0x41, 0x90, 0xe6, 0x8d, 0x73, 0x82, 0xb5, 0x82, 0xc4, 0x82, 0xe6, 0x82, 0xad, 0x97, 0x98, 0x97, 0x70, 0x82, 0xb3, 0x82, 0xea, 0x82, 0xc4, 0x82, 0xa2, 0x82, 0xbd, 0x4a, 0x49, 0x53, 0x20, 0x43, 0x20, 0x36, 0x32, 0x32, 0x30, 0x81, 0x69, 0x8c, 0xbb, 0x8d, 0xdd, 0x82, 0xcc, 0x4a, 0x49, 0x53, 0x20, 0x58, 0x20, 0x30, 0x32, 0x30, 0x31, 0x81, 0x6a, 0x82, 0xcc, 0x38, 0x83, 0x72, 0x83, 0x62, 0x83, 0x67, 0x95, 0x84, 0x8d, 0x86, 0x81, 0x69, 0x88, 0xc8, 0x89, 0xba, 0x81, 0x75, 0x89, 0x70, 0x90, 0x94, 0x8e, 0x9a, 0x81, 0x45, 0x94, 0xbc, 0x8a, 0x70, 0x83, 0x4a, 0x83, 0x69, 0x81, 0x76, 0x81, 0x6a, 0x82, 0xc6, 0x81, 0x41, 0x4a, 0x49, 0x53, 0x20, 0x43, 0x20, 0x36, 0x32, 0x32, 0x36, 0x81, 0x69, 0x8c, 0xbb, 0x8d, 0xdd, 0x82, 0xcc, 0x4a, 0x49, 0x53, 0x20, 0x58, 0x20, 0x30, 0x32, 0x30, 0x38, 0x81, 0x41, 0x88, 0xc8, 0x89, 0xba, 0x81, 0x75, 0x8a, 0xbf, 0x8e, 0x9a, 0x81, 0x76, 0x81, 0x6a, 0x82, 0xcc, 0x97, 0xbc, 0x95, 0xb6, 0x8e, 0x9a, 0x8f, 0x57, 0x8d, 0x87, 0x82, 0xf0, 0x95, 0x5c, 0x8c, 0xbb, 0x82, 0xb5, 0x82, 0xe6, 0x82, 0xa4, 0x82, 0xc6, 0x82, 0xb5, 0x82, 0xbd, 0x81, 0x42, 0x82, 0xdc, 0x82, 0xbd, 0x81, 0x41, 0x83, 0x74, 0x83, 0x40, 0x83, 0x43, 0x83, 0x8b, 0x82, 0xcc, 0x91, 0xe5, 0x82, 0xab, 0x82, 0xb3, 0x82, 0xe2, 0x8f, 0x88, 0x97, 0x9d, 0x8e, 0x9e, 0x8a, 0xd4, 0x82, 0xcc, 0x92, 0x5a, 0x8f, 0x6b, 0x82, 0xf0, 0x90, 0x7d, 0x82, 0xe9, 0x82, 0xbd, 0x82, 0xdf, 0x81, 0x41, 0x83, 0x47, 0x83, 0x58, 0x83, 0x50, 0x81, 0x5b, 0x83, 0x76, 0x83, 0x56, 0x81, 0x5b, 0x83, 0x50, 0x83, 0x93, 0x83, 0x58, 0x82, 0xc8, 0x82, 0xb5, 0x82, 0xc5, 0x8d, 0xac, 0x8d, 0xdd, 0x89, 0xc2, 0x94, 0x5c, 0x82, 0xc9, 0x82, 0xb7, 0x82, 0xe9, 0x82, 0xb1, 0x82, 0xc6, 0x82, 0xf0, 0x8a, 0xe9, 0x90, 0x7d, 0x82, 0xb5, 0x82, 0xbd, 0x81, 0x42},
"文字符号化方式Shift_JISの設計者らは、先行してよく利用されていたJIS C 6220現在のJIS X 0201の8ビット符号以下「英数字・半角カナ」と、JIS C 6226現在のJIS X 0208、以下「漢字」の両文字集合を表現しようとした。また、ファイルの大きさや処理時間の短縮を図るため、エスケープシーケンスなしで混在可能にすることを企図した。",
},
// add more from mutations of https://en.wikipedia.org/wiki/World_Wide_Web
}
// run tests
for _, val := range testData {
2022-05-31 13:54:04 +00:00
// fmt.Println("Testing ", val)
expected := []byte(val.message)
decoded, err := DecodeCharset(val.original, "text/plain; charset="+val.charset)
if len(expected) == 0 {
if err == nil {
t.Error("Expected err but have ", err)
} else {
2022-05-31 13:54:04 +00:00
// fmt.Println("Expected err: ", err)
continue
}
} else {
if err != nil {
t.Error("Expected ok but have ", err)
}
}
if bytes.Equal(decoded, expected) {
2023-07-07 12:41:10 +00:00
// fmt.Println("Successful decoding of ", val.params, ":", string(decoded))
} else {
t.Error("Wrong encoding of ", val.charset, ".Expected\n", expected, "\nbut have\n", decoded)
}
if strings.Compare(val.message, string(decoded)) != 0 {
t.Error("Wrong message for ", val.charset, ".Expected\n", val.message, "\nbut have\n", string(decoded))
}
}
}