feat: [GODT-360] detect charset embedded in html and xml

This commit is contained in:
James Houlahan 2020-05-28 12:36:42 +02:00
parent 84d344cb0a
commit 9e633400b0
8 changed files with 65 additions and 48 deletions

View File

@ -4,6 +4,9 @@ Changelog [format](http://keepachangelog.com/en/1.0.0/)
## Unreleased
### Added
* GODT-360 Detect charset embedded in html/xml.
### Changed
* GODT-388 Support for both bridge and import/export credentials by package users.
* GODT-387 Store factory to make store optional.

View File

@ -27,7 +27,6 @@ require (
github.com/chzyer/logex v1.1.10 // indirect
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 // indirect
github.com/cucumber/godog v0.8.1
github.com/danieljoos/wincred v1.0.2 // indirect
github.com/emersion/go-imap v0.0.0-20200415151653-89df427d2794
github.com/emersion/go-imap-appendlimit v0.0.0-20190308131241-25671c986a6a
github.com/emersion/go-imap-idle v0.0.0-20190519112320-2704abd7050e

View File

@ -5,8 +5,8 @@ github.com/ProtonMail/bcrypt v0.0.0-20170924085257-7509ea014998 h1:YT2uVwQiRQZxC
github.com/ProtonMail/bcrypt v0.0.0-20170924085257-7509ea014998/go.mod h1:HecWFHognK8GfRDGnFQbW/LiV7A3MX3gZVs45vk5h8I=
github.com/ProtonMail/crypto v0.0.0-20190604143603-d3d8a14a4d4f h1:cFhATQTJGK2iZ0dc+jRhr75mh6bsc5Ug6NliaBya8Kw=
github.com/ProtonMail/crypto v0.0.0-20190604143603-d3d8a14a4d4f/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
github.com/ProtonMail/docker-credential-helpers v1.0.0 h1:0DQXbZNvUszWgXUuP7TzvQdwnkK1D5Zf/glBgCFJFCk=
github.com/ProtonMail/docker-credential-helpers v1.0.0/go.mod h1:R1gQindzdYFcWJuuGXteYHDJzUCVtyU+EpEqp9aWcFs=
github.com/ProtonMail/docker-credential-helpers v1.1.0 h1:+kvUIpwWcbtP3WFv5sSvkFn/XLzSqPOB5AAthuk9xPk=
github.com/ProtonMail/docker-credential-helpers v1.1.0/go.mod h1:mK0aBveCxhnQ756AmaTfXMZDeULvheYVhF/MWMErN5g=
github.com/ProtonMail/go-appdir v1.1.0 h1:9hdNDlU9kTqRKVNzmoqah8qqrj5QZyLByQdwQNlFWig=
github.com/ProtonMail/go-appdir v1.1.0/go.mod h1:3d8Y9F5mbEUjrYbcJ3rcDxcWbqbttF+011nVZmdRdzc=
github.com/ProtonMail/go-apple-mobileconfig v0.0.0-20160701194735-7ea9927a11f6 h1:YsSJ/mvZFYydQm/hRrt8R8UtgETixN2y3LK98f5LT60=
@ -43,8 +43,8 @@ github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSY
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/cucumber/godog v0.8.1 h1:lVb+X41I4YDreE+ibZ50bdXmySxgRviYFgKY6Aw4XE8=
github.com/cucumber/godog v0.8.1/go.mod h1:vSh3r/lM+psC1BPXvdkSEuNjmXfpVqrMGYAElF6hxnA=
github.com/danieljoos/wincred v1.0.2 h1:zf4bhty2iLuwgjgpraD2E9UbvO+fe54XXGJbOwe23fU=
github.com/danieljoos/wincred v1.0.2/go.mod h1:SnuYRW9lp1oJrZX/dXJqr0cPK5gYXqx3EJbmjhLdK9U=
github.com/danieljoos/wincred v1.1.0 h1:3RNcEpBg4IhIChZdFRSdlQt1QjCp1sMAPIrOnm7Yf8g=
github.com/danieljoos/wincred v1.1.0/go.mod h1:XYlo+eRTsVA9aHGp7NGjFkPla4m+DCL7hqDjlFjiygg=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=

View File

@ -106,7 +106,7 @@ func combineParts(m *pmapi.Message, parts []io.Reader, headers []textproto.MIMEH
if b, err = ioutil.ReadAll(d); err != nil {
b, err = pmmime.DecodeCharset(b, params)
b, err = pmmime.DecodeCharset(b, contentType)
if err != nil {
log.Warn("Decode charset error: ", err)
return false, err

View File

@ -29,8 +29,9 @@ import (
@ -197,18 +198,26 @@ func EncodeHeader(s string) string {
// DecodeCharset decodes the original using content type parameters.
// When charset is missing it checks that the content is valid utf8.
// If it isn't, it checks whether the content is valid latin1 (iso-8859-1), and if so,
// reencodes it as utf-8.
func DecodeCharset(original []byte, contentTypeParams map[string]string) ([]byte, error) {
// If the charset is specified, use that.
if charset, ok := contentTypeParams["charset"]; ok {
decoder, err := selectDecoder(charset)
// If the charset parameter is missing, it checks that the content is valid utf8.
// If the content isn't valid utf8, it looks for a charset declared inside the html/xml itself.
// If no charset can be determined that way, it falls back to windows-1252.
// It then reencodes the content as utf-8.
func DecodeCharset(original []byte, contentType string) ([]byte, error) {
// If the contentType is specified, parse it and use its charset parameter when present.
if contentType != "" {
_, params, err := ParseMediaType(contentType)
if err != nil {
return original, errors.Wrap(err, "unknown charset was specified")
return nil, err
return decoder.Bytes(original)
if charset, ok := params["charset"]; ok {
decoder, err := selectDecoder(charset)
if err != nil {
return original, errors.Wrap(err, "unknown charset was specified")
return decoder.Bytes(original)
// The charset was not specified. First try utf8.
@ -216,16 +225,22 @@ func DecodeCharset(original []byte, contentTypeParams map[string]string) ([]byte
return original, nil
// Fallback to latin1.
// In future this should fallback to whatever default encoding user specified.
decoded, err := charmap.ISO8859_1.NewDecoder().Bytes(original)
if err != nil {
return original, errors.Wrap(err, "failed to decode as latin1")
// encoding will be windows-1252 if it can't be determined properly.
encoding, name, certain := charset.DetermineEncoding(original, contentType)
if !certain {
logrus.WithField("encoding", name).Warn("Determined encoding but was not certain")
// If the decoded string is not valid utf8, it wasn't latin1, so give up.
// Reencode as UTF-8.
decoded, err := encoding.NewDecoder().Bytes(original)
if err != nil {
return original, errors.Wrap(err, "failed to decode as windows-1252")
// If the decoded string is not valid utf8, it wasn't windows-1252, so give up.
if !utf8.Valid(decoded) {
return original, errors.Wrap(err, "failed to decode as latin1")
return original, errors.Wrap(err, "failed to decode as windows-1252")
return decoded, nil

View File

@ -330,81 +330,81 @@ func TestGetEncoding(t *testing.T) {
func TestEncodeReader(t *testing.T) {
// define test data
testData := []struct {
params map[string]string
charset string
original []byte
message string
// russian
map[string]string{"charset": "koi8-r"},
// а, з, б, у, к, а, а, б, в, г, д, е, ё
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
map[string]string{"charset": "KOI8-R"},
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
map[string]string{"charset": "csKOI8R"},
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
map[string]string{"charset": "koi8-u"},
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
map[string]string{"charset": "iso-8859-5"},
// а , з , б , у , к , а , а , б , в , г , д , е , ё
[]byte{0xD0, 0xD7, 0xD1, 0xE3, 0xDA, 0xD0, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xF1},
map[string]string{"charset": "csWrong"},
[]byte{0xD0, 0xD7, 0xD1, 0xE3, 0xDA, 0xD0, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6},
map[string]string{"charset": "utf8"},
[]byte{0xD0, 0xB0, 0xD0, 0xB7, 0xD0, 0xB1, 0xD1, 0x83, 0xD0, 0xBA, 0xD0, 0xB0, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD1, 0x91},
// czechoslovakia
map[string]string{"charset": "windows-1250"},
[]byte{225, 228, 232, 233, 236, 244},
// umlauts
map[string]string{"charset": "iso-8859-1"},
[]byte{196, 203, 214, 220, 228, 235, 246, 252},
// latvia
map[string]string{"charset": "iso-8859-4"},
[]byte{224, 239, 243, 182, 254},
{ // encoded by https://www.motobit.com/util/charset-codepage-conversion.asp
map[string]string{"charset": "utf7"},
[]byte("He wes Leovena+APA-es sone -- li+APA-e him be Drihten.+A6QDtw- +A7MDuwPOA8MDwwOx- +A7wDvwPF- +A60DtAPJA8MDsQO9- +A7UDuwO7A7cDvQO5A7oDrg-. +BCcENQRABD0ENQQ7BDg- +BDgENwQxBEs- +BDcENAQ1BEEETA- +BDg- +BEIEMAQ8-,+BCcENQRABD0ENQQ7BDg- +BDgENwQxBEs- +BDcENAQ1BEEETA- +BDg- +BEIEMAQ8-,+C68LvguuC7ELvwuoC80LpA- +C64Lygu0C78LlQuzC78LsgvH- +C6QLrgu/C7QLzQuuC8oLtAu/- +C6oLywuyC80- +C4cLqQu/C6QLvgu1C6QLwQ- +C44LmQvNC5ULwQuuC80- +C5ULvgujC8sLrgvN-."),
"He wes Leovenaðes sone -- liðe him be Drihten.Τη γλώσσα μου έδωσαν ελληνική. Чернели избы здесь и там,Чернели избы здесь и там,யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம்.",
// iconv -f UTF8 -t GB2312 utf8.txt | hexdump -v -e '"0x" 1/1 "%x, "'
{ // encoded by iconv; dump by `cat gb2312.txt | hexdump -v -e '"0x" 1/1 "%x "'` and reformat; text from https://zh.wikipedia.org/wiki/GB_2312
map[string]string{"charset": "GB2312"},
[]byte{0x47, 0x42, 0x20, 0x32, 0x33, 0x31, 0x32, 0xb5, 0xc4, 0xb3, 0xf6, 0xcf, 0xd6, 0xa3, 0xac, 0xbb, 0xf9, 0xb1, 0xbe, 0xc2, 0xfa, 0xd7, 0xe3, 0xc1, 0xcb, 0xba, 0xba, 0xd7, 0xd6, 0xb5, 0xc4, 0xbc, 0xc6, 0xcb, 0xe3, 0xbb, 0xfa, 0xb4, 0xa6, 0xc0, 0xed, 0xd0, 0xe8, 0xd2, 0xaa, 0xa3, 0xac, 0xcb, 0xfc, 0xcb, 0xf9, 0xca, 0xd5, 0xc2, 0xbc, 0xb5, 0xc4, 0xba, 0xba, 0xd7, 0xd6, 0xd2, 0xd1, 0xbe, 0xad, 0xb8, 0xb2, 0xb8, 0xc7, 0xd6, 0xd0, 0xb9, 0xfa, 0xb4, 0xf3, 0xc2, 0xbd, 0x39, 0x39, 0x2e, 0x37, 0x35, 0x25, 0xb5, 0xc4, 0xca, 0xb9, 0xd3, 0xc3, 0xc6, 0xb5, 0xc2, 0xca, 0xa1, 0xa3, 0xb5, 0xab, 0xb6, 0xd4, 0xd3, 0xda, 0xc8, 0xcb, 0xc3, 0xfb},
"GB 2312的出现基本满足了汉字的计算机处理需要它所收录的汉字已经覆盖中国大陆99.75%的使用频率。但对于人名",
{ // encoded by iconv; text from https://jp.wikipedia.org/wiki/Shift_JIS
map[string]string{"charset": "shift-jis"},
[]byte{0x95, 0xb6, 0x8e, 0x9a, 0x95, 0x84, 0x8d, 0x86, 0x89, 0xbb, 0x95, 0xfb, 0x8e, 0xae, 0x53, 0x68, 0x69, 0x66, 0x74, 0x5f, 0x4a, 0x49, 0x53, 0x82, 0xcc, 0x90, 0xdd, 0x8c, 0x76, 0x8e, 0xd2, 0x82, 0xe7, 0x82, 0xcd, 0x81, 0x41, 0x90, 0xe6, 0x8d, 0x73, 0x82, 0xb5, 0x82, 0xc4, 0x82, 0xe6, 0x82, 0xad, 0x97, 0x98, 0x97, 0x70, 0x82, 0xb3, 0x82, 0xea, 0x82, 0xc4, 0x82, 0xa2, 0x82, 0xbd, 0x4a, 0x49, 0x53, 0x20, 0x43, 0x20, 0x36, 0x32, 0x32, 0x30, 0x81, 0x69, 0x8c, 0xbb, 0x8d, 0xdd, 0x82, 0xcc, 0x4a, 0x49, 0x53, 0x20, 0x58, 0x20, 0x30, 0x32, 0x30, 0x31, 0x81, 0x6a, 0x82, 0xcc, 0x38, 0x83, 0x72, 0x83, 0x62, 0x83, 0x67, 0x95, 0x84, 0x8d, 0x86, 0x81, 0x69, 0x88, 0xc8, 0x89, 0xba, 0x81, 0x75, 0x89, 0x70, 0x90, 0x94, 0x8e, 0x9a, 0x81, 0x45, 0x94, 0xbc, 0x8a, 0x70, 0x83, 0x4a, 0x83, 0x69, 0x81, 0x76, 0x81, 0x6a, 0x82, 0xc6, 0x81, 0x41, 0x4a, 0x49, 0x53, 0x20, 0x43, 0x20, 0x36, 0x32, 0x32, 0x36, 0x81, 0x69, 0x8c, 0xbb, 0x8d, 0xdd, 0x82, 0xcc, 0x4a, 0x49, 0x53, 0x20, 0x58, 0x20, 0x30, 0x32, 0x30, 0x38, 0x81, 0x41, 0x88, 0xc8, 0x89, 0xba, 0x81, 0x75, 0x8a, 0xbf, 0x8e, 0x9a, 0x81, 0x76, 0x81, 0x6a, 0x82, 0xcc, 0x97, 0xbc, 0x95, 0xb6, 0x8e, 0x9a, 0x8f, 0x57, 0x8d, 0x87, 0x82, 0xf0, 0x95, 0x5c, 0x8c, 0xbb, 0x82, 0xb5, 0x82, 0xe6, 0x82, 0xa4, 0x82, 0xc6, 0x82, 0xb5, 0x82, 0xbd, 0x81, 0x42, 0x82, 0xdc, 0x82, 0xbd, 0x81, 0x41, 0x83, 0x74, 0x83, 0x40, 0x83, 0x43, 0x83, 0x8b, 0x82, 0xcc, 0x91, 0xe5, 0x82, 0xab, 0x82, 0xb3, 0x82, 0xe2, 0x8f, 0x88, 0x97, 0x9d, 0x8e, 0x9e, 0x8a, 0xd4, 0x82, 0xcc, 0x92, 0x5a, 0x8f, 0x6b, 0x82, 0xf0, 0x90, 0x7d, 0x82, 0xe9, 0x82, 0xbd, 0x82, 0xdf, 0x81, 0x41, 0x83, 0x47, 0x83, 0x58, 0x83, 0x50, 0x81, 0x5b, 0x83, 0x76, 0x83, 0x56, 0x81, 0x5b, 0x83, 0x50, 0x83, 0x93, 0x83, 0x58, 0x82, 0xc8, 0x82, 0xb5, 0x82, 0xc5, 0x8d, 0xac, 0x8d, 0xdd, 0x89, 0xc2, 0x94, 0x5c, 0x82, 0xc9, 0x82, 0xb7, 0x82, 0xe9, 0x82, 0xb1, 0x82, 0xc6, 0x82, 0xf0, 0x8a, 0xe9, 0x90, 0x7d, 0x82, 0xb5, 0x82, 0xbd, 0x81, 0x42},
"文字符号化方式Shift_JISの設計者らは、先行してよく利用されていたJIS C 6220現在のJIS X 0201の8ビット符号以下「英数字・半角カナ」と、JIS C 6226現在のJIS X 0208、以下「漢字」の両文字集合を表現しようとした。また、ファイルの大きさや処理時間の短縮を図るため、エスケープシーケンスなしで混在可能にすることを企図した。",
@ -417,7 +417,7 @@ func TestEncodeReader(t *testing.T) {
for _, val := range testData {
//fmt.Println("Testing ", val)
expected := []byte(val.message)
decoded, err := DecodeCharset(val.original, val.params)
decoded, err := DecodeCharset(val.original, "text/plain; charset="+val.charset)
if len(expected) == 0 {
if err == nil {
t.Error("Expected err but have ", err)
@ -434,10 +434,10 @@ func TestEncodeReader(t *testing.T) {
if bytes.Equal(decoded, expected) {
// fmt.Println("Successful decoding of ", val.params, ":", string(decoded))
} else {
t.Error("Wrong encoding of ", val.params, ".Expected\n", expected, "\nbut have\n", decoded)
t.Error("Wrong encoding of ", val.charset, ".Expected\n", expected, "\nbut have\n", decoded)
if strings.Compare(val.message, string(decoded)) != 0 {
t.Error("Wrong message for ", val.params, ".Expected\n", val.message, "\nbut have\n", string(decoded))
t.Error("Wrong message for ", val.charset, ".Expected\n", val.message, "\nbut have\n", string(decoded))

View File

@ -174,7 +174,7 @@ func convertHexToUTF(charset, value string) (string, error) {
if err != nil {
return "", err
utf8, err := DecodeCharset(raw, map[string]string{"charset": charset})
utf8, err := DecodeCharset(raw, "text/plain; charset="+charset)
return "utf-8''" + percentHexEscape(utf8), err

View File

@ -243,7 +243,7 @@ func getContentType(header textproto.MIMEHeader) (mediatype string, params map[s
contentType = "text/plain"
return mime.ParseMediaType(contentType)
return ParseMediaType(contentType)
// ===================== MIME Printer ===================================
@ -322,14 +322,14 @@ func NewPlainTextCollector(targetAccepter VisitAcceptor) *PlainTextCollector {
func (ptc *PlainTextCollector) Accept(partReader io.Reader, header textproto.MIMEHeader, hasPlainSibling bool, isFirst, isLast bool) (err error) {
if isFirst {
if IsLeaf(header) {
mediaType, params, _ := getContentType(header)
mediaType, _, _ := getContentType(header)
disp, _, _ := mime.ParseMediaType(header.Get("Content-Disposition"))
if mediaType == "text/plain" && disp != "attachment" {
partData, _ := ioutil.ReadAll(partReader)
decodedPart := decodePart(bytes.NewReader(partData), header)
if buffer, err := ioutil.ReadAll(decodedPart); err == nil {
buffer, err = DecodeCharset(buffer, params)
buffer, err = DecodeCharset(buffer, header.Get("Content-Type"))
if err != nil {
log.Warnln("Decode charset error:", err)
return err
@ -377,13 +377,13 @@ func (bc *BodyCollector) Accept(partReader io.Reader, header textproto.MIMEHeade
// TODO: Collect html and plaintext - if there's html with plain sibling don't include plain/text.
if isFirst {
if IsLeaf(header) {
mediaType, params, _ := getContentType(header)
mediaType, _, _ := getContentType(header)
disp, _, _ := mime.ParseMediaType(header.Get("Content-Disposition"))
if disp != "attachment" {
partData, _ := ioutil.ReadAll(partReader)
decodedPart := decodePart(bytes.NewReader(partData), header)
if buffer, err := ioutil.ReadAll(decodedPart); err == nil {
buffer, err = DecodeCharset(buffer, params)
buffer, err = DecodeCharset(buffer, header.Get("Content-Type"))
if err != nil {
log.Warnln("Decode charset error:", err)
return err
@ -444,14 +444,14 @@ func NewAttachmentsCollector(targetAccepter VisitAcceptor) *AttachmentsCollector
func (ac *AttachmentsCollector) Accept(partReader io.Reader, header textproto.MIMEHeader, hasPlainSibling bool, isFirst, isLast bool) (err error) {
if isFirst {
if IsLeaf(header) {
mediaType, params, _ := getContentType(header)
mediaType, _, _ := getContentType(header)
disp, _, _ := mime.ParseMediaType(header.Get("Content-Disposition"))
if (mediaType != "text/html" && mediaType != "text/plain") || disp == "attachment" {
partData, _ := ioutil.ReadAll(partReader)
decodedPart := decodePart(bytes.NewReader(partData), header)
if buffer, err := ioutil.ReadAll(decodedPart); err == nil {
buffer, err = DecodeCharset(buffer, params)
buffer, err = DecodeCharset(buffer, header.Get("Content-Type"))
if err != nil {
log.Warnln("Decode charset error:", err)
return err