proton-bridge/pkg/message/parser.go

823 lines
21 KiB
Go

// Copyright (c) 2024 Proton AG
//
// This file is part of Proton Mail Bridge.
//
// Proton Mail Bridge is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proton Mail Bridge is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Proton Mail Bridge. If not, see <https://www.gnu.org/licenses/>.
package message
import (
"bytes"
"fmt"
"io"
"mime"
"net/mail"
"regexp"
"strings"
"github.com/ProtonMail/gluon/rfc5322"
"github.com/ProtonMail/gluon/rfc822"
"github.com/ProtonMail/go-proton-api"
"github.com/ProtonMail/proton-bridge/v3/pkg/message/parser"
pmmime "github.com/ProtonMail/proton-bridge/v3/pkg/mime"
"github.com/emersion/go-message"
"github.com/google/uuid"
"github.com/jaytaylor/html2text"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
type MIMEBody string
type Body string
type Message struct {
MIMEBody MIMEBody
RichBody Body
PlainBody Body
Attachments []Attachment
MIMEType rfc822.MIMEType
IsReply bool
Subject string
Sender *mail.Address
ToList []*mail.Address
CCList []*mail.Address
BCCList []*mail.Address
ReplyTos []*mail.Address
References []string
ExternalID string
InReplyTo string
XForward string
}
type Attachment struct {
Header mail.Header
Name string
ContentID string
MIMEType string
MIMEParams map[string]string
Disposition proton.Disposition
Data []byte
}
// Parse parses an RFC822 message.
func Parse(r io.Reader) (m Message, err error) {
return parseIOReaderImpl(r, false)
}
// ParseAndAllowInvalidAddressLists parses an RFC822 message and allows email address lists to be invalid.
func ParseAndAllowInvalidAddressLists(r io.Reader) (m Message, err error) {
return parseIOReaderImpl(r, true)
}
func parseIOReaderImpl(r io.Reader, allowInvalidAddressLists bool) (m Message, err error) {
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("panic while parsing message: %v", r)
}
}()
p, err := parser.New(r)
if err != nil {
return Message{}, errors.Wrap(err, "failed to create new parser")
}
return parse(p, allowInvalidAddressLists)
}
// ParseWithParser parses an RFC822 message using an existing parser.
func ParseWithParser(p *parser.Parser, allowInvalidAddressLists bool) (m Message, err error) {
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("panic while parsing message: %v", r)
}
}()
return parse(p, allowInvalidAddressLists)
}
func parse(p *parser.Parser, allowInvalidAddressLists bool) (Message, error) {
if err := convertEncodedTransferEncoding(p); err != nil {
return Message{}, errors.Wrap(err, "failed to convert encoded transfer encoding")
}
if err := convertForeignEncodings(p); err != nil {
return Message{}, errors.Wrap(err, "failed to convert foreign encodings")
}
if err := patchInlineImages(p); err != nil {
return Message{}, errors.Wrap(err, "patching inline images failed")
}
m, err := parseMessageHeader(p.Root().Header, allowInvalidAddressLists)
if err != nil {
return Message{}, errors.Wrap(err, "failed to parse message header")
}
atts, err := collectAttachments(p)
if err != nil {
return Message{}, errors.Wrap(err, "failed to collect attachments")
}
m.Attachments = atts
richBody, plainBody, err := buildBodies(p)
if err != nil {
return Message{}, errors.Wrap(err, "failed to build bodies")
}
mimeBody, err := buildMIMEBody(p)
if err != nil {
return Message{}, errors.Wrap(err, "failed to build mime body")
}
m.RichBody = Body(richBody)
m.PlainBody = Body(plainBody)
m.MIMEBody = MIMEBody(mimeBody)
mimeType, err := determineBodyMIMEType(p)
if err != nil {
return Message{}, errors.Wrap(err, "failed to get mime type")
}
m.MIMEType = rfc822.MIMEType(mimeType)
return m, nil
}
// buildMIMEBody builds mime body from the parser returned by NewParser.
func buildMIMEBody(p *parser.Parser) (mimeBody string, err error) {
buf := new(bytes.Buffer)
if err := p.NewWriter().Write(buf); err != nil {
return "", fmt.Errorf("failed to write message: %w", err)
}
return buf.String(), nil
}
// convertEncodedTransferEncoding decodes any RFC2047-encoded content transfer encodings.
// Such content transfer encodings go against RFC but still exist in the wild anyway.
func convertEncodedTransferEncoding(p *parser.Parser) error {
logrus.Trace("Converting encoded transfer encoding")
return p.NewWalker().
RegisterDefaultHandler(func(p *parser.Part) error {
encoding := p.Header.Get("Content-Transfer-Encoding")
if encoding == "" {
return nil
}
dec, err := pmmime.WordDec.DecodeHeader(encoding)
if err != nil {
return err
}
p.Header.Set("Content-Transfer-Encoding", dec)
return nil
}).
Walk()
}
func convertForeignEncodings(p *parser.Parser) error {
logrus.Trace("Converting foreign encodings")
return p.NewWalker().
RegisterContentTypeHandler("text/html", func(p *parser.Part) error {
if err := p.ConvertToUTF8(); err != nil {
return err
}
return p.ConvertMetaCharset()
}).
RegisterContentTypeHandler("text/.*", func(p *parser.Part) error {
if p.IsAttachment() {
return nil
}
return p.ConvertToUTF8()
}).
Walk()
}
func collectAttachments(p *parser.Parser) ([]Attachment, error) {
var (
atts []Attachment
err error
)
w := p.NewWalker().
RegisterContentDispositionHandler("attachment", func(p *parser.Part) error {
att, err := parseAttachment(p.Header, p.Body)
if err != nil {
return err
}
atts = append(atts, att)
return nil
}).
RegisterContentTypeHandler("text/calendar", func(p *parser.Part) error {
att, err := parseAttachment(p.Header, p.Body)
if err != nil {
return err
}
atts = append(atts, att)
return nil
}).
RegisterContentTypeHandler("text/.*", func(p *parser.Part) error {
return nil
}).
RegisterDefaultHandler(func(p *parser.Part) error {
if len(p.Children()) > 0 {
return nil
}
att, err := parseAttachment(p.Header, p.Body)
if err != nil {
return err
}
atts = append(atts, att)
return nil
})
if err = w.Walk(); err != nil {
return nil, err
}
return atts, nil
}
// buildBodies collects all text/html and text/plain parts and returns two bodies,
// - a rich text body (in which html is allowed), and
// - a plaintext body (in which html is converted to plaintext).
//
// text/html parts are converted to plaintext in order to build the plaintext body,
// unless there is already a plaintext part provided via multipart/alternative,
// in which case the provided alternative is chosen.
func buildBodies(p *parser.Parser) (richBody, plainBody string, err error) {
richParts, err := collectBodyParts(p, "text/html")
if err != nil {
return
}
plainParts, err := collectBodyParts(p, "text/plain")
if err != nil {
return
}
richBuilder, plainBuilder := strings.Builder{}, strings.Builder{}
for _, richPart := range richParts {
_, _ = richBuilder.Write(richPart.Body)
}
for _, plainPart := range plainParts {
_, _ = plainBuilder.Write(getPlainBody(plainPart))
}
return richBuilder.String(), plainBuilder.String(), nil
}
// collectBodyParts collects all body parts in the parse tree, preferring
// parts of the given content type if alternatives exist.
func collectBodyParts(p *parser.Parser, preferredContentType string) (parser.Parts, error) {
v := p.
NewVisitor(func(p *parser.Part, visit parser.Visit) (interface{}, error) {
childParts, err := collectChildParts(p, visit)
if err != nil {
return nil, err
}
return joinChildParts(childParts), nil
}).
RegisterRule("multipart/alternative", func(p *parser.Part, visit parser.Visit) (interface{}, error) {
childParts, err := collectChildParts(p, visit)
if err != nil {
return nil, err
}
return bestChoice(childParts, preferredContentType), nil
}).
RegisterRule("text/plain", func(p *parser.Part, visit parser.Visit) (interface{}, error) {
if p.IsAttachment() {
return parser.Parts{}, nil
}
return parser.Parts{p}, nil
}).
RegisterRule("text/html", func(p *parser.Part, visit parser.Visit) (interface{}, error) {
if p.IsAttachment() {
return parser.Parts{}, nil
}
return parser.Parts{p}, nil
})
res, err := v.Visit()
if err != nil {
return nil, err
}
return res.(parser.Parts), nil //nolint:forcetypeassert
}
func collectChildParts(p *parser.Part, visit parser.Visit) ([]parser.Parts, error) {
childParts := []parser.Parts{}
for _, child := range p.Children() {
res, err := visit(child)
if err != nil {
return nil, err
}
childParts = append(childParts, res.(parser.Parts)) //nolint:forcetypeassert
}
return childParts, nil
}
func joinChildParts(childParts []parser.Parts) parser.Parts {
res := parser.Parts{}
for _, parts := range childParts {
res = append(res, parts...)
}
return res
}
func bestChoice(childParts []parser.Parts, preferredContentType string) parser.Parts {
// If one of the parts has preferred content type, use that.
for i := len(childParts) - 1; i >= 0; i-- {
if allPartsHaveContentType(childParts[i], preferredContentType) {
return childParts[i]
}
}
// Otherwise, choose the last one, if it exists.
if len(childParts) > 0 {
return childParts[len(childParts)-1]
}
return parser.Parts{}
}
func allPartsHaveContentType(parts parser.Parts, contentType string) bool {
if len(parts) == 0 {
return false
}
for _, part := range parts {
t, _, err := part.ContentType()
if err != nil {
return false
}
if t != contentType {
return false
}
}
return true
}
func determineBodyMIMEType(p *parser.Parser) (string, error) {
var isHTML bool
w := p.NewWalker().
RegisterContentTypeHandler("text/html", func(p *parser.Part) (err error) {
isHTML = true
return
})
if err := w.WalkSkipAttachment(); err != nil {
return "", err
}
if isHTML {
return "text/html", nil
}
return "text/plain", nil
}
// getPlainBody returns the body of the given part, converting html to
// plaintext where possible.
func getPlainBody(part *parser.Part) []byte {
contentType, _, err := part.ContentType()
if err != nil {
return part.Body
}
switch contentType {
case "text/html":
text, err := html2text.FromReader(bytes.NewReader(part.Body))
if err != nil {
return part.Body
}
return []byte(text)
default:
return part.Body
}
}
func parseMessageHeader(h message.Header, allowInvalidAddressLists bool) (Message, error) {
var m Message
for fields := h.Fields(); fields.Next(); {
switch strings.ToLower(fields.Key()) {
case "subject":
s, err := fields.Text()
if err != nil {
if s, err = pmmime.DecodeHeader(fields.Value()); err != nil {
return Message{}, errors.Wrap(err, "failed to parse subject")
}
}
m.Subject = s
case "from":
sender, err := rfc5322.ParseAddressList(fields.Value())
if err != nil {
if !allowInvalidAddressLists {
return Message{}, errors.Wrap(err, "failed to parse from")
}
logrus.WithError(err).Warn("failed to parse from")
}
if len(sender) > 0 {
m.Sender = sender[0]
}
case "to":
toList, err := rfc5322.ParseAddressList(fields.Value())
if err != nil {
if !allowInvalidAddressLists {
return Message{}, errors.Wrap(err, "failed to parse to")
}
logrus.WithError(err).Warn("failed to parse to")
}
m.ToList = toList
case "reply-to":
replyTos, err := rfc5322.ParseAddressList(fields.Value())
if err != nil {
if !allowInvalidAddressLists {
return Message{}, errors.Wrap(err, "failed to parse reply-to")
}
logrus.WithError(err).Warn("failed to parse reply-to")
}
m.ReplyTos = replyTos
case "cc":
ccList, err := rfc5322.ParseAddressList(fields.Value())
if err != nil {
if !allowInvalidAddressLists {
return Message{}, errors.Wrap(err, "failed to parse cc")
}
logrus.WithError(err).Warn("failed to parse cc")
}
m.CCList = ccList
case "bcc":
bccList, err := rfc5322.ParseAddressList(fields.Value())
if err != nil {
if !allowInvalidAddressLists {
return Message{}, errors.Wrap(err, "failed to parse bcc")
}
logrus.WithError(err).Warn("failed to parse bcc")
}
m.BCCList = bccList
case "message-id":
m.ExternalID = regexp.MustCompile("<(.*)>").ReplaceAllString(fields.Value(), "$1")
case "in-reply-to":
m.InReplyTo = regexp.MustCompile("<(.*)>").ReplaceAllString(fields.Value(), "$1")
case "x-forwarded-message-id":
m.XForward = regexp.MustCompile("<(.*)>").ReplaceAllString(fields.Value(), "$1")
case "references":
for _, ref := range strings.Fields(fields.Value()) {
for _, ref := range strings.Split(ref, ",") {
m.References = append(m.References, strings.Trim(ref, "<>"))
}
}
}
}
return m, nil
}
func parseAttachment(h message.Header, body []byte) (Attachment, error) {
att := Attachment{
Data: body,
}
mimeHeader, err := toMailHeader(h)
if err != nil {
return Attachment{}, err
}
att.Header = mimeHeader
mimeType, mimeTypeParams, err := pmmime.ParseMediaType(h.Get("Content-Type"))
if err != nil {
return Attachment{}, err
}
att.MIMEType = mimeType
att.MIMEParams = mimeTypeParams
// Prefer attachment name from filename param in content disposition.
// If not available, try to get it from name param in content type.
// Otherwise fallback to attachment.bin.
disp, dispParams, err := pmmime.ParseMediaType(h.Get("Content-Disposition"))
if err == nil {
att.Disposition = proton.Disposition(disp)
if filename, ok := dispParams["filename"]; ok {
att.Name = filename
}
}
if att.Name == "" {
if filename, ok := mimeTypeParams["name"]; ok {
att.Name = filename
} else if mimeType == string(rfc822.MessageRFC822) {
att.Name = "message.eml"
} else if ext, err := mime.ExtensionsByType(att.MIMEType); err == nil && len(ext) > 0 {
att.Name = "attachment" + ext[0]
} else {
att.Name = "attachment.bin"
}
}
// Only set ContentID if it should be inline;
// API infers content disposition based on whether ContentID is present.
// If Content-Disposition is present, we base our decision on that.
// Otherwise, if Content-Disposition is missing but there is a ContentID, set it.
// (This is necessary because some clients don't set Content-Disposition at all,
// so we need to rely on other information to deduce if it's inline or attachment.)
if h.Has("Content-Disposition") {
disp, _, err := pmmime.ParseMediaType(h.Get("Content-Disposition"))
if err != nil {
return Attachment{}, err
}
if disp == string(proton.InlineDisposition) {
att.ContentID = strings.Trim(h.Get("Content-Id"), " <>")
}
} else if h.Has("Content-Id") {
att.ContentID = strings.Trim(h.Get("Content-Id"), " <>")
}
return att, nil
}
func toMailHeader(h message.Header) (mail.Header, error) {
mimeHeader := make(mail.Header)
if err := forEachDecodedHeaderField(h, func(key, val string) error {
mimeHeader[key] = []string{val}
return nil
}); err != nil {
return nil, err
}
return mimeHeader, nil
}
func forEachDecodedHeaderField(h message.Header, fn func(string, string) error) error {
fields := h.Fields()
for fields.Next() {
text, err := fields.Text()
if err != nil {
if !message.IsUnknownCharset(err) {
return err
}
if text, err = pmmime.DecodeHeader(fields.Value()); err != nil {
return err
}
}
if err := fn(fields.Key(), text); err != nil {
return err
}
}
return nil
}
func patchInlineImages(p *parser.Parser) error {
// This code will only attempt to patch the root level children. I tested with different email clients and as soon
// as you reply/forward a message the entire content gets converted into HTML (Apple Mail/Thunderbird/Evolution).
// If you are forcing text formatting (Evolution), the inline images of the original email are stripped.
// The only reason we need to apply this modification is that Apple Mail can send out text + inline image parts
// if the text does not exceed the 76 char column limit.
// Based on this, it's unlikely we will see any other variations.
root := p.Root()
children := root.Children()
if len(children) < 2 {
return nil
}
result := make([]inlinePatchJob, len(children))
var (
transformationNeeded bool
prevPart *parser.Part
prevContentType string
prevContentTypeMap map[string]string
)
for i := 0; i < len(children); i++ {
curPart := children[i]
contentType, contentTypeMap, err := curPart.ContentType()
if err != nil {
return fmt.Errorf("failed to get content type for for child %v:%w", i, err)
}
if rfc822.MIMEType(contentType) == rfc822.TextPlain {
result[i] = &inlinePatchBodyOnly{part: curPart, contentTypeMap: contentTypeMap}
} else if strings.HasPrefix(contentType, "image/") {
disposition, err := getImageContentDisposition(curPart)
if err != nil {
return fmt.Errorf("failed to get content disposition for child %v:%w", i, err)
}
if disposition == "inline" && !curPart.HasContentID() {
if rfc822.MIMEType(prevContentType) == rfc822.TextPlain {
result[i-1] = &inlinePatchBodyWithInlineImage{
textPart: prevPart,
imagePart: curPart,
textContentTypeMap: prevContentTypeMap,
}
} else {
result[i] = &inlinePatchInlineImageOnly{part: curPart, partIndex: i, root: root}
}
transformationNeeded = true
}
}
prevPart = curPart
prevContentType = contentType
prevContentTypeMap = contentTypeMap
}
if !transformationNeeded {
return nil
}
for _, t := range result {
if t != nil {
t.Patch()
}
}
return nil
}
func getImageContentDisposition(curPart *parser.Part) (string, error) {
disposition, _, err := curPart.ContentDisposition()
if err == nil {
return disposition, nil
}
if curPart.Header.Get("Content-Disposition") != "" {
return "", err
}
if curPart.HasContentID() {
return "inline", nil
}
return "attachment", nil
}
type inlinePatchJob interface {
Patch()
}
// inlinePatchBodyOnly is meant to be used for standalone text parts that need to be converted to html once we applty
// one of the changes.
type inlinePatchBodyOnly struct {
part *parser.Part
contentTypeMap map[string]string
}
func (i *inlinePatchBodyOnly) Patch() {
newBody := []byte(`<html><body><p>`)
newBody = append(newBody, patchNewLineWithHTMLBreaks(i.part.Body)...)
newBody = append(newBody, []byte(`</p></body></html>`)...)
i.part.Body = newBody
i.part.Header.SetContentType("text/html", i.contentTypeMap)
}
// inlinePatchBodyWithInlineImage patches a previous text part so that it refers to that inline image.
type inlinePatchBodyWithInlineImage struct {
textPart *parser.Part
textContentTypeMap map[string]string
imagePart *parser.Part
}
// inlinePatchInlineImageOnly handle the case where the inline image is not proceeded by a text part. To avoid
// having to parse any possible previous part, we just inject a new part that references this image.
type inlinePatchInlineImageOnly struct {
part *parser.Part
partIndex int
root *parser.Part
}
func (i inlinePatchInlineImageOnly) Patch() {
contentID := uuid.NewString()
// Convert previous part to text/html && inject image.
newBody := []byte(fmt.Sprintf(`<html><body><img src="cid:%v"/></body></html>`, contentID))
i.part.Header.Set("content-id", contentID)
// create new text part
textPart := &parser.Part{
Header: message.Header{},
Body: newBody,
}
textPart.Header.SetContentType("text/html", map[string]string{"charset": "UTF-8"})
i.root.InsertChild(i.partIndex, textPart)
}
func (i *inlinePatchBodyWithInlineImage) Patch() {
contentID := uuid.NewString()
// Convert previous part to text/html && inject image.
newBody := []byte(`<html><body><p>`)
newBody = append(newBody, patchNewLineWithHTMLBreaks(i.textPart.Body)...)
newBody = append(newBody, []byte(`</p>`)...)
newBody = append(newBody, []byte(fmt.Sprintf(`<img src="cid:%v"/>`, contentID))...)
newBody = append(newBody, []byte(`</body></html>`)...)
i.textPart.Body = newBody
i.textPart.Header.SetContentType("text/html", i.textContentTypeMap)
// Add content id to curPart
i.imagePart.Header.Set("content-id", contentID)
}
func patchNewLineWithHTMLBreaks(input []byte) []byte {
dst := make([]byte, 0, len(input))
index := 0
for {
slice := input[index:]
newLineIndex := bytes.IndexByte(slice, '\n')
if newLineIndex == -1 {
dst = append(dst, input[index:]...)
return dst
}
injectIndex := newLineIndex
if newLineIndex > 0 && slice[newLineIndex-1] == '\r' {
injectIndex--
}
dst = append(dst, slice[0:injectIndex]...)
dst = append(dst, '<', 'b', 'r', '/', '>')
dst = append(dst, slice[injectIndex:newLineIndex+1]...)
index += newLineIndex + 1
}
}