This commit is contained in:
Flavio Fois
2026-02-04 19:57:31 +01:00
parent 0d6157b2ff
commit 0cda0a26fc
25 changed files with 1549 additions and 66 deletions

View File

@@ -0,0 +1,261 @@
package internal
import (
"encoding/base64"
"fmt"
"io"
"os"
"strings"
"github.com/richardlehane/mscfb"
"golang.org/x/text/encoding/unicode"
"golang.org/x/text/transform"
)
// MAPI Property Tags
//
// Each value is the four-hex-digit property identifier as it appears in a
// property stream name of the form "__substg1.0_<tag><type>". The parsers in
// this file compare these against characters 12-16 of the stream name.
const (
// Root message properties (consumed by processRootProperty).
prSubject = "0037"
prBody = "1000"
prBodyHTML = "1013"
prSenderName = "0C1A"
prSenderEmail = "0C1F"
prDisplayTo = "0E04" // Display list of To recipients
prDisplayCc = "0E03"
prDisplayBcc = "0E02"
// NOTE(review): prMessageHeaders is declared but not read by any function
// visible in this file.
prMessageHeaders = "007D"
prClientSubmitTime = "0039" // Date
// Attachment properties (consumed by processAttachProperty).
prAttachLongFilename = "3707"
prAttachFilename = "3704"
prAttachData = "3701"
prAttachMimeTag = "370E"
)
// MAPI Property Types
//
// Each value is the four-hex-digit type suffix of a property stream name
// ("__substg1.0_<tag><type>"), i.e. characters 16-20 of the name.
const (
// ptUnicode values are decoded with decodeUTF16.
ptUnicode = "001F"
// ptString8 values are converted to a string byte-for-byte.
ptString8 = "001E"
// NOTE(review): ptBinary is declared but not referenced by any function
// visible in this file.
ptBinary = "0102"
)
// msgParser pairs a compound-file reader with a map of raw property bytes.
//
// NOTE(review): no function visible in this file constructs or uses this
// type (parseMsgFile works on the reader directly) — confirm whether it is
// still needed.
type msgParser struct {
// reader is the underlying mscfb compound-file reader.
reader *mscfb.Reader
// props holds raw property values; presumably keyed by stream name or
// property tag — TODO confirm, nothing in view populates it.
props map[string][]byte
}
// parseMsgFile reads the Outlook .msg compound file at filePath and returns
// the message fields and attachments found in its property streams.
//
// Property streams are named "__substg1.0_<tag><type>". Streams whose Path is
// empty are treated as properties of the message itself; streams whose
// innermost containing storage is named "__attach_version1.0_*" are treated
// as properties of an attachment. Every other entry is ignored.
//
// Parsing is best effort: a stream that cannot be read is skipped rather than
// failing the whole file. An error is returned only when the file cannot be
// opened or is not a valid compound file.
//
// Attachments are returned in the order their storages are first encountered
// while iterating the file, so the result is stable across runs.
func parseMsgFile(filePath string) (*EmailData, error) {
	f, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	doc, err := mscfb.New(f)
	if err != nil {
		return nil, err
	}

	email := &EmailData{
		To:  []string{},
		Cc:  []string{},
		Bcc: []string{},
	}

	const (
		propPrefix   = "__substg1.0_"
		attachPrefix = "__attach_version1.0_"
		// A usable property stream name is the prefix followed by four
		// tag digits and four type digits.
		minPropNameLen = len(propPrefix) + 8
	)

	// Attachments are collected in a map for lookup, with a parallel slice
	// recording first-seen order: ranging over the map directly would emit
	// attachments in a different order on every run.
	attachments := make(map[string]*EmailAttachment)
	var attachOrder []string

	for entry, err := doc.Next(); err == nil; entry, err = doc.Next() {
		name := entry.Name
		if !strings.HasPrefix(name, propPrefix) || len(name) < minPropNameLen {
			// Not a property stream, or too short to carry a tag and type;
			// the property helpers slice the name at fixed offsets.
			continue
		}

		path := entry.Path // names of the storages containing this entry
		switch {
		case len(path) == 0:
			// Property of the message itself.
			val, err := io.ReadAll(doc)
			if err != nil {
				continue
			}
			processRootProperty(name, val, email)

		case strings.HasPrefix(path[len(path)-1], attachPrefix):
			// Attachment property. Key by the full storage path, not just
			// the last component: an attachment nested inside an embedded
			// message can reuse the same storage name as an outer
			// attachment, and the two must not be merged into one record.
			key := strings.Join(path, "/")
			att, seen := attachments[key]
			if !seen {
				att = &EmailAttachment{}
				attachments[key] = att
				attachOrder = append(attachOrder, key)
			}
			val, err := io.ReadAll(doc)
			if err != nil {
				continue
			}
			processAttachProperty(name, val, att)
		}
	}

	// Finalize attachments in first-seen order.
	for _, key := range attachOrder {
		att := attachments[key]
		normalizeSignedMsgAttachment(att)
		if att.Filename == "" {
			att.Filename = "attachment"
		}
		// Only add if we have data.
		if len(att.Data) > 0 {
			email.Attachments = append(email.Attachments, *att)
		}
	}
	return email, nil
}

// normalizeSignedMsgAttachment post-processes an attachment whose content
// type contains "multipart/signed" so it can be opened as an .eml file.
//
// If the payload already contains typical mail header names it is kept as is;
// otherwise it is treated as Base64 (padded, then unpadded) and replaced by
// the decoded bytes. In both successful cases ".eml" is appended to the
// filename unless already present. If decoding fails, a message is printed
// and the attachment is left unchanged. Other content types are untouched.
func normalizeSignedMsgAttachment(att *EmailAttachment) {
	if !strings.Contains(strings.ToLower(att.ContentType), "multipart/signed") {
		return
	}

	dataStr := string(att.Data)
	// Check if it already looks like a plain text EML (contains typical headers).
	looksLikeEML := strings.Contains(dataStr, "Content-Type:") ||
		strings.Contains(dataStr, "MIME-Version:") ||
		strings.Contains(dataStr, "From:")

	if !looksLikeEML {
		// Strip line breaks and blanks so wrapped Base64 decodes cleanly.
		compact := strings.Map(func(r rune) rune {
			if r == '\r' || r == '\n' || r == ' ' || r == '\t' {
				return -1
			}
			return r
		}, dataStr)

		decoded, err := base64.StdEncoding.DecodeString(compact)
		if err != nil {
			// Retry without padding.
			decoded, err = base64.RawStdEncoding.DecodeString(compact)
		}
		if err != nil {
			fmt.Println("Failed to decode multipart/signed attachment:", err)
			return
		}
		att.Data = decoded
	}

	if !strings.HasSuffix(strings.ToLower(att.Filename), ".eml") {
		att.Filename += ".eml"
	}
}
// processRootProperty applies one message-level property stream to email.
//
// name is the stream name, expected to be "__substg1.0_" followed by four
// property-tag digits and four property-type digits; data is the raw stream
// content. Names too short to contain both fields are ignored.
//
// Only string-typed values (ptUnicode, ptString8) are applied, with one
// exception: the HTML body is also accepted as ptBinary and used as raw
// bytes. Properties of any other type are skipped so that they cannot
// overwrite an already populated field with an empty string.
//
// The submit time (prClientSubmitTime) is not extracted yet.
// TODO: when EmailData gains a date field, handle type "0040": the value is
// an 8-byte little-endian count of 100ns ticks since 1601-01-01 UTC.
func processRootProperty(name string, data []byte, email *EmailData) {
	const prefixLen = len("__substg1.0_")
	if len(name) < prefixLen+8 {
		return // malformed name: slicing below would panic
	}
	tag := name[prefixLen : prefixLen+4]
	typ := name[prefixLen+4 : prefixLen+8]

	var strVal string
	switch {
	case typ == ptUnicode:
		strVal = decodeUTF16(data)
	case typ == ptString8:
		strVal = string(data)
	case typ == ptBinary && tag == prBodyHTML:
		// NOTE(review): the bytes are presumably in the message's own code
		// page; they are used as-is here — confirm whether transcoding is
		// needed.
		strVal = string(data)
	default:
		return
	}

	switch tag {
	case prSubject:
		email.Subject = strVal
	case prBody:
		// Plain text only fills an empty body, so HTML set earlier wins.
		if email.Body == "" {
			email.Body = strVal
		}
	case prBodyHTML:
		// Prefer HTML, but never replace an existing body with nothing.
		if strVal != "" {
			email.Body = strVal
		}
	case prSenderName:
		if email.From == "" {
			email.From = strVal
		} else {
			// From currently holds the address; put the name in front.
			email.From = fmt.Sprintf("%s <%s>", strVal, email.From)
		}
	case prSenderEmail:
		if email.From == "" {
			email.From = strVal
		} else if !strings.Contains(email.From, "<") {
			// From currently holds only the name; append the address.
			email.From = fmt.Sprintf("%s <%s>", email.From, strVal)
		}
	case prDisplayTo:
		// Display strings list recipients separated by ";".
		email.To = splitAndTrim(strVal)
	case prDisplayCc:
		email.Cc = splitAndTrim(strVal)
	case prDisplayBcc:
		email.Bcc = splitAndTrim(strVal)
	}
}
// processAttachProperty applies one attachment property stream to att.
//
// name is the stream name, expected to be "__substg1.0_" followed by four
// property-tag digits and four property-type digits; data is the raw stream
// content. Names too short to contain both fields are ignored.
//
// The attachment payload (prAttachData) is stored as raw bytes regardless of
// the declared type. The filename and MIME type are applied only when the
// value is string-typed (ptUnicode or ptString8), so a property of another
// type cannot blank a value that was already set. The long filename always
// wins; the short filename is used only while no filename is set.
func processAttachProperty(name string, data []byte, att *EmailAttachment) {
	const prefixLen = len("__substg1.0_")
	if len(name) < prefixLen+8 {
		return // malformed name: slicing below would panic
	}
	tag := name[prefixLen : prefixLen+4]
	typ := name[prefixLen+4 : prefixLen+8]

	if tag == prAttachData {
		att.Data = data
		return
	}

	var strVal string
	switch typ {
	case ptUnicode:
		strVal = decodeUTF16(data)
	case ptString8:
		strVal = string(data)
	default:
		return
	}

	switch tag {
	case prAttachLongFilename:
		att.Filename = strVal
	case prAttachFilename:
		if att.Filename == "" {
			att.Filename = strVal
		}
	case prAttachMimeTag:
		att.ContentType = strVal
	}
}
// decodeUTF16 converts b from little-endian UTF-16 (any BOM is ignored, per
// the decoder options) to a Go string and strips trailing NUL characters.
// Decoding is best effort: the transform error is deliberately discarded and
// whatever bytes the transform returned are used.
func decodeUTF16(b []byte) string {
	utf16LE := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)
	out, _, _ := transform.Bytes(utf16LE.NewDecoder(), b)
	text := string(out)
	// Stored strings may carry NUL terminators; drop them.
	return strings.TrimRight(text, "\x00")
}
// splitAndTrim breaks s on ';', trims surrounding whitespace from each piece
// and drops pieces that end up empty. It returns nil when s is empty or
// contains no non-blank piece.
func splitAndTrim(s string) []string {
	var fields []string
	for len(s) > 0 {
		// Peel off everything up to the next ';' (or the rest of the string).
		var head string
		if i := strings.IndexByte(s, ';'); i >= 0 {
			head, s = s[:i], s[i+1:]
		} else {
			head, s = s, ""
		}
		if head = strings.TrimSpace(head); head != "" {
			fields = append(fields, head)
		}
	}
	return fields
}