Add TNEF handling and email loading improvements
- Implement TNEF extraction and recursive parsing in new `tnef_reader.go` and associated tests. - Create tests for TNEF extraction scenarios in `tnef_diag_test.go`, `tnef_diag7_test.go`, and `tnef_diag8_test.go`.
This commit is contained in:
444
backend/utils/mail/tnef_reader.go
Normal file
444
backend/utils/mail/tnef_reader.go
Normal file
@@ -0,0 +1,444 @@
|
||||
package internal
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"mime"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/teamwork/tnef"
|
||||
)
|
||||
|
||||
// tnefMagic is the TNEF file signature (little-endian 0x223E9F78).
|
||||
var tnefMagic = []byte{0x78, 0x9F, 0x3E, 0x22}
|
||||
|
||||
const maxTNEFDepth = 10
|
||||
|
||||
// isTNEFData returns true if the given byte slice starts with the TNEF magic number.
|
||||
func isTNEFData(data []byte) bool {
|
||||
return len(data) >= 4 &&
|
||||
data[0] == tnefMagic[0] &&
|
||||
data[1] == tnefMagic[1] &&
|
||||
data[2] == tnefMagic[2] &&
|
||||
data[3] == tnefMagic[3]
|
||||
}
|
||||
|
||||
// isTNEFAttachment returns true if an attachment is a TNEF-encoded winmail.dat.
|
||||
// Detection is based on filename, content-type, or the TNEF magic bytes.
|
||||
func isTNEFAttachment(att EmailAttachment) bool {
|
||||
filenameLower := strings.ToLower(att.Filename)
|
||||
if filenameLower == "winmail.dat" {
|
||||
return true
|
||||
}
|
||||
ctLower := strings.ToLower(att.ContentType)
|
||||
if strings.Contains(ctLower, "application/ms-tnef") ||
|
||||
strings.Contains(ctLower, "application/vnd.ms-tnef") {
|
||||
return true
|
||||
}
|
||||
return isTNEFData(att.Data)
|
||||
}
|
||||
|
||||
// extractTNEFAttachments decodes a TNEF blob and returns the files embedded
|
||||
// inside it, recursively following nested embedded MAPI messages.
|
||||
func extractTNEFAttachments(data []byte) ([]EmailAttachment, error) {
|
||||
return extractTNEFRecursive(data, 0)
|
||||
}
|
||||
|
||||
func extractTNEFRecursive(data []byte, depth int) ([]EmailAttachment, error) {
|
||||
if depth > maxTNEFDepth {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
decoded, err := tnef.Decode(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var attachments []EmailAttachment
|
||||
|
||||
// Collect non-placeholder file attachments from the library output.
|
||||
for _, att := range decoded.Attachments {
|
||||
if len(att.Data) == 0 {
|
||||
continue
|
||||
}
|
||||
// Skip the small MAPI placeholder text ("L'allegato è un messaggio
|
||||
// incorporato MAPI 1.0...") that Outlook inserts for embedded messages.
|
||||
if isEmbeddedMsgPlaceholder(att) {
|
||||
continue
|
||||
}
|
||||
|
||||
filename := att.Title
|
||||
if filename == "" || filename == "Untitled Attachment" {
|
||||
filename = inferFilename(att.Data)
|
||||
}
|
||||
|
||||
attachments = append(attachments, EmailAttachment{
|
||||
Filename: filename,
|
||||
ContentType: mimeTypeFromFilename(filename),
|
||||
Data: att.Data,
|
||||
})
|
||||
}
|
||||
|
||||
// Recursively dig into embedded MAPI messages stored in
|
||||
// attAttachment (0x9005) → PR_ATTACH_DATA_OBJ (0x3701).
|
||||
for _, stream := range findEmbeddedTNEFStreamsFromRaw(data) {
|
||||
subAtts, _ := extractTNEFRecursive(stream, depth+1)
|
||||
attachments = append(attachments, subAtts...)
|
||||
}
|
||||
|
||||
return attachments, nil
|
||||
}
|
||||
|
||||
// isEmbeddedMsgPlaceholder returns true if the attachment is a tiny placeholder
|
||||
// that Outlook generates for embedded MAPI messages ("L'allegato è un messaggio
|
||||
// incorporato MAPI 1.0" or equivalent in other languages).
|
||||
func isEmbeddedMsgPlaceholder(att *tnef.Attachment) bool {
|
||||
if len(att.Data) > 300 {
|
||||
return false
|
||||
}
|
||||
lower := strings.ToLower(string(att.Data))
|
||||
return strings.Contains(lower, "mapi 1.0") ||
|
||||
strings.Contains(lower, "embedded message") ||
|
||||
strings.Contains(lower, "messaggio incorporato")
|
||||
}
|
||||
|
||||
// inferFilename picks a reasonable filename based on the data's magic bytes.
|
||||
func inferFilename(data []byte) string {
|
||||
if looksLikeEML(data) {
|
||||
return "embedded_message.eml"
|
||||
}
|
||||
if isTNEFData(data) {
|
||||
return "embedded.dat"
|
||||
}
|
||||
if len(data) >= 8 {
|
||||
if data[0] == 0xD0 && data[1] == 0xCF && data[2] == 0x11 && data[3] == 0xE0 {
|
||||
return "embedded_message.msg"
|
||||
}
|
||||
}
|
||||
return "attachment.dat"
|
||||
}
|
||||
|
||||
// looksLikeEML returns true if data starts with typical RFC 5322 headers.
|
||||
func looksLikeEML(data []byte) bool {
|
||||
if len(data) < 20 {
|
||||
return false
|
||||
}
|
||||
// Quick check: must start with printable ASCII
|
||||
if data[0] < 32 || data[0] > 126 {
|
||||
return false
|
||||
}
|
||||
prefix := strings.ToLower(string(data[:min(200, len(data))]))
|
||||
return strings.HasPrefix(prefix, "mime-version:") ||
|
||||
strings.HasPrefix(prefix, "from:") ||
|
||||
strings.HasPrefix(prefix, "received:") ||
|
||||
strings.HasPrefix(prefix, "date:") ||
|
||||
strings.HasPrefix(prefix, "content-type:") ||
|
||||
strings.HasPrefix(prefix, "return-path:")
|
||||
}
|
||||
|
||||
// expandTNEFAttachments iterates over the attachment list and replaces any
|
||||
// TNEF-encoded winmail.dat entries with the files they contain. Attachments
|
||||
// that are not TNEF are passed through unchanged.
|
||||
func expandTNEFAttachments(attachments []EmailAttachment) []EmailAttachment {
|
||||
var result []EmailAttachment
|
||||
for _, att := range attachments {
|
||||
if isTNEFAttachment(att) {
|
||||
extracted, err := extractTNEFAttachments(att.Data)
|
||||
if err == nil && len(extracted) > 0 {
|
||||
result = append(result, extracted...)
|
||||
continue
|
||||
}
|
||||
// If extraction fails, keep the original blob.
|
||||
}
|
||||
result = append(result, att)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Raw TNEF attribute scanner — extracts nested TNEF streams from embedded
|
||||
// MAPI messages that the teamwork/tnef library does not handle.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// findEmbeddedTNEFStreamsFromRaw scans the raw TNEF byte stream for
|
||||
// attAttachment (0x00069005) attribute blocks, parses their MAPI properties,
|
||||
// and extracts any PR_ATTACH_DATA_OBJ (0x3701) values that begin with a
|
||||
// TNEF signature.
|
||||
func findEmbeddedTNEFStreamsFromRaw(tnefData []byte) [][]byte {
|
||||
if len(tnefData) < 6 || !isTNEFData(tnefData) {
|
||||
return nil
|
||||
}
|
||||
|
||||
var streams [][]byte
|
||||
offset := 6 // skip TNEF signature (4) + key (2)
|
||||
|
||||
for offset+9 < len(tnefData) {
|
||||
level := tnefData[offset]
|
||||
attrID := binary.LittleEndian.Uint32(tnefData[offset+1 : offset+5])
|
||||
attrLen := int(binary.LittleEndian.Uint32(tnefData[offset+5 : offset+9]))
|
||||
dataStart := offset + 9
|
||||
|
||||
if dataStart+attrLen > len(tnefData) || attrLen < 0 {
|
||||
break
|
||||
}
|
||||
|
||||
// attAttachment (0x00069005) at attachment level (0x02)
|
||||
if level == 0x02 && attrID == 0x00069005 && attrLen > 100 {
|
||||
mapiData := tnefData[dataStart : dataStart+attrLen]
|
||||
embedded := extractPRAttachDataObjFromMAPI(mapiData)
|
||||
if embedded != nil && len(embedded) > 22 {
|
||||
// Skip the 16-byte IID_IMessage GUID
|
||||
afterGuid := embedded[16:]
|
||||
if isTNEFData(afterGuid) {
|
||||
streams = append(streams, afterGuid)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// level(1) + id(4) + len(4) + data(attrLen) + checksum(2)
|
||||
offset += 9 + attrLen + 2
|
||||
}
|
||||
return streams
|
||||
}
|
||||
|
||||
// extractPRAttachDataObjFromMAPI parses a MAPI properties block (from an
|
||||
// attAttachment attribute) and returns the raw value of PR_ATTACH_DATA_OBJ
|
||||
// (property ID 0x3701, type PT_OBJECT 0x000D).
|
||||
func extractPRAttachDataObjFromMAPI(data []byte) []byte {
|
||||
if len(data) < 4 {
|
||||
return nil
|
||||
}
|
||||
count := int(binary.LittleEndian.Uint32(data[0:4]))
|
||||
off := 4
|
||||
|
||||
for i := 0; i < count && off+4 <= len(data); i++ {
|
||||
propTag := binary.LittleEndian.Uint32(data[off : off+4])
|
||||
propType := propTag & 0xFFFF
|
||||
propID := (propTag >> 16) & 0xFFFF
|
||||
off += 4
|
||||
|
||||
// Named properties (ID >= 0x8000) have extra GUID + kind fields.
|
||||
if propID >= 0x8000 {
|
||||
if off+20 > len(data) {
|
||||
return nil
|
||||
}
|
||||
kind := binary.LittleEndian.Uint32(data[off+16 : off+20])
|
||||
off += 20
|
||||
if kind == 0 { // MNID_ID
|
||||
off += 4
|
||||
} else { // MNID_STRING
|
||||
if off+4 > len(data) {
|
||||
return nil
|
||||
}
|
||||
nameLen := int(binary.LittleEndian.Uint32(data[off : off+4]))
|
||||
off += 4 + nameLen
|
||||
off += padTo4(nameLen)
|
||||
}
|
||||
}
|
||||
|
||||
off = skipMAPIPropValue(data, off, propType, propID)
|
||||
if off < 0 {
|
||||
return nil // parse error
|
||||
}
|
||||
// If skipMAPIPropValue returned a special sentinel, extract it.
|
||||
// We use a hack: skipMAPIPropValue can't return the data directly,
|
||||
// so we handle PT_OBJECT / 0x3701 inline below.
|
||||
}
|
||||
|
||||
// Simpler approach: re-scan specifically for 0x3701.
|
||||
return extractPRAttachDataObjDirect(data)
|
||||
}
|
||||
|
||||
// extractPRAttachDataObjDirect re-scans the MAPI property block and
|
||||
// returns the raw value of PR_ATTACH_DATA_OBJ (0x3701, PT_OBJECT).
|
||||
func extractPRAttachDataObjDirect(data []byte) []byte {
|
||||
if len(data) < 4 {
|
||||
return nil
|
||||
}
|
||||
count := int(binary.LittleEndian.Uint32(data[0:4]))
|
||||
off := 4
|
||||
|
||||
for i := 0; i < count && off+4 <= len(data); i++ {
|
||||
propTag := binary.LittleEndian.Uint32(data[off : off+4])
|
||||
propType := propTag & 0xFFFF
|
||||
propID := (propTag >> 16) & 0xFFFF
|
||||
off += 4
|
||||
|
||||
// Skip named property headers.
|
||||
if propID >= 0x8000 {
|
||||
if off+20 > len(data) {
|
||||
return nil
|
||||
}
|
||||
kind := binary.LittleEndian.Uint32(data[off+16 : off+20])
|
||||
off += 20
|
||||
if kind == 0 {
|
||||
off += 4
|
||||
} else {
|
||||
if off+4 > len(data) {
|
||||
return nil
|
||||
}
|
||||
nameLen := int(binary.LittleEndian.Uint32(data[off : off+4]))
|
||||
off += 4 + nameLen
|
||||
off += padTo4(nameLen)
|
||||
}
|
||||
}
|
||||
|
||||
switch propType {
|
||||
case 0x0002: // PT_SHORT (padded to 4)
|
||||
off += 4
|
||||
case 0x0003, 0x000A: // PT_LONG, PT_ERROR
|
||||
off += 4
|
||||
case 0x000B: // PT_BOOLEAN (padded to 4)
|
||||
off += 4
|
||||
case 0x0004: // PT_FLOAT
|
||||
off += 4
|
||||
case 0x0005: // PT_DOUBLE
|
||||
off += 8
|
||||
case 0x0006: // PT_CURRENCY
|
||||
off += 8
|
||||
case 0x0007: // PT_APPTIME
|
||||
off += 8
|
||||
case 0x0014: // PT_I8
|
||||
off += 8
|
||||
case 0x0040: // PT_SYSTIME
|
||||
off += 8
|
||||
case 0x0048: // PT_CLSID
|
||||
off += 16
|
||||
case 0x001E, 0x001F: // PT_STRING8, PT_UNICODE
|
||||
off = skipCountedBlobs(data, off)
|
||||
case 0x0102: // PT_BINARY
|
||||
off = skipCountedBlobs(data, off)
|
||||
case 0x000D: // PT_OBJECT
|
||||
if off+4 > len(data) {
|
||||
return nil
|
||||
}
|
||||
cnt := int(binary.LittleEndian.Uint32(data[off : off+4]))
|
||||
off += 4
|
||||
for j := 0; j < cnt; j++ {
|
||||
if off+4 > len(data) {
|
||||
return nil
|
||||
}
|
||||
olen := int(binary.LittleEndian.Uint32(data[off : off+4]))
|
||||
off += 4
|
||||
if propID == 0x3701 && off+olen <= len(data) {
|
||||
return data[off : off+olen]
|
||||
}
|
||||
off += olen
|
||||
off += padTo4(olen)
|
||||
}
|
||||
case 0x1002: // PT_MV_SHORT
|
||||
off = skipMVFixed(data, off, 4)
|
||||
case 0x1003: // PT_MV_LONG
|
||||
off = skipMVFixed(data, off, 4)
|
||||
case 0x1005: // PT_MV_DOUBLE
|
||||
off = skipMVFixed(data, off, 8)
|
||||
case 0x1014: // PT_MV_I8
|
||||
off = skipMVFixed(data, off, 8)
|
||||
case 0x1040: // PT_MV_SYSTIME
|
||||
off = skipMVFixed(data, off, 8)
|
||||
case 0x101E, 0x101F: // PT_MV_STRING8, PT_MV_UNICODE
|
||||
off = skipCountedBlobs(data, off)
|
||||
case 0x1048: // PT_MV_CLSID
|
||||
off = skipMVFixed(data, off, 16)
|
||||
case 0x1102: // PT_MV_BINARY
|
||||
off = skipCountedBlobs(data, off)
|
||||
default:
|
||||
// Unknown type, can't continue
|
||||
return nil
|
||||
}
|
||||
|
||||
if off < 0 || off > len(data) {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// skipCountedBlobs advances past a MAPI value that stores count + N
|
||||
// length-prefixed blobs (used by PT_STRING8, PT_UNICODE, PT_BINARY, and
|
||||
// their multi-valued variants).
|
||||
func skipCountedBlobs(data []byte, off int) int {
|
||||
if off+4 > len(data) {
|
||||
return -1
|
||||
}
|
||||
cnt := int(binary.LittleEndian.Uint32(data[off : off+4]))
|
||||
off += 4
|
||||
for j := 0; j < cnt; j++ {
|
||||
if off+4 > len(data) {
|
||||
return -1
|
||||
}
|
||||
blen := int(binary.LittleEndian.Uint32(data[off : off+4]))
|
||||
off += 4 + blen
|
||||
off += padTo4(blen)
|
||||
}
|
||||
return off
|
||||
}
|
||||
|
||||
// skipMVFixed advances past a multi-valued fixed-size property
|
||||
// (count followed by count*elemSize bytes).
|
||||
func skipMVFixed(data []byte, off int, elemSize int) int {
|
||||
if off+4 > len(data) {
|
||||
return -1
|
||||
}
|
||||
cnt := int(binary.LittleEndian.Uint32(data[off : off+4]))
|
||||
off += 4 + cnt*elemSize
|
||||
return off
|
||||
}
|
||||
|
||||
// skipMAPIPropValue is a generic value skipper (unused in the current flow
|
||||
// but kept for completeness).
|
||||
func skipMAPIPropValue(data []byte, off int, propType uint32, _ uint32) int {
|
||||
switch propType {
|
||||
case 0x0002:
|
||||
return off + 4
|
||||
case 0x0003, 0x000A, 0x000B, 0x0004:
|
||||
return off + 4
|
||||
case 0x0005, 0x0006, 0x0007, 0x0014, 0x0040:
|
||||
return off + 8
|
||||
case 0x0048:
|
||||
return off + 16
|
||||
case 0x001E, 0x001F, 0x0102, 0x000D:
|
||||
return skipCountedBlobs(data, off)
|
||||
case 0x1002, 0x1003:
|
||||
return skipMVFixed(data, off, 4)
|
||||
case 0x1005, 0x1014, 0x1040:
|
||||
return skipMVFixed(data, off, 8)
|
||||
case 0x1048:
|
||||
return skipMVFixed(data, off, 16)
|
||||
case 0x101E, 0x101F, 0x1102:
|
||||
return skipCountedBlobs(data, off)
|
||||
default:
|
||||
return -1
|
||||
}
|
||||
}
|
||||
|
||||
// padTo4 returns the number of padding bytes needed to reach a 4-byte boundary.
|
||||
func padTo4(n int) int {
|
||||
r := n % 4
|
||||
if r == 0 {
|
||||
return 0
|
||||
}
|
||||
return 4 - r
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// MIME type helper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// mimeTypeFromFilename guesses the MIME type from a file extension.
|
||||
// Falls back to "application/octet-stream" when the type is unknown.
|
||||
func mimeTypeFromFilename(filename string) string {
|
||||
ext := strings.ToLower(filepath.Ext(filename))
|
||||
if ext == "" {
|
||||
return "application/octet-stream"
|
||||
}
|
||||
t := mime.TypeByExtension(ext)
|
||||
if t == "" {
|
||||
return "application/octet-stream"
|
||||
}
|
||||
// Strip any parameters (e.g. "; charset=utf-8")
|
||||
if idx := strings.Index(t, ";"); idx != -1 {
|
||||
t = strings.TrimSpace(t[:idx])
|
||||
}
|
||||
return t
|
||||
}
|
||||
Reference in New Issue
Block a user