Add TNEF handling and email loading improvements

- Implement TNEF extraction and recursive parsing in new `tnef_reader.go` and associated tests.
- Create tests for TNEF extraction scenarios in `tnef_diag_test.go`, `tnef_diag7_test.go`, and `tnef_diag8_test.go`.
This commit is contained in:
Flavio Fois
2026-02-14 09:03:41 +01:00
parent 33cb171fb1
commit 54a3dff1c2
23 changed files with 2029 additions and 18 deletions

View File

@@ -0,0 +1,444 @@
package internal
import (
"encoding/binary"
"mime"
"path/filepath"
"strings"
"github.com/teamwork/tnef"
)
// tnefMagic is the TNEF file signature (little-endian 0x223E9F78).
var tnefMagic = []byte{0x78, 0x9F, 0x3E, 0x22}
const maxTNEFDepth = 10
// isTNEFData returns true if the given byte slice starts with the TNEF magic number.
func isTNEFData(data []byte) bool {
return len(data) >= 4 &&
data[0] == tnefMagic[0] &&
data[1] == tnefMagic[1] &&
data[2] == tnefMagic[2] &&
data[3] == tnefMagic[3]
}
// isTNEFAttachment returns true if an attachment is a TNEF-encoded winmail.dat.
// Detection is based on filename, content-type, or the TNEF magic bytes.
func isTNEFAttachment(att EmailAttachment) bool {
filenameLower := strings.ToLower(att.Filename)
if filenameLower == "winmail.dat" {
return true
}
ctLower := strings.ToLower(att.ContentType)
if strings.Contains(ctLower, "application/ms-tnef") ||
strings.Contains(ctLower, "application/vnd.ms-tnef") {
return true
}
return isTNEFData(att.Data)
}
// extractTNEFAttachments decodes a TNEF blob and returns the files embedded
// inside it, recursively following nested embedded MAPI messages.
func extractTNEFAttachments(data []byte) ([]EmailAttachment, error) {
return extractTNEFRecursive(data, 0)
}
func extractTNEFRecursive(data []byte, depth int) ([]EmailAttachment, error) {
if depth > maxTNEFDepth {
return nil, nil
}
decoded, err := tnef.Decode(data)
if err != nil {
return nil, err
}
var attachments []EmailAttachment
// Collect non-placeholder file attachments from the library output.
for _, att := range decoded.Attachments {
if len(att.Data) == 0 {
continue
}
// Skip the small MAPI placeholder text ("L'allegato è un messaggio
// incorporato MAPI 1.0...") that Outlook inserts for embedded messages.
if isEmbeddedMsgPlaceholder(att) {
continue
}
filename := att.Title
if filename == "" || filename == "Untitled Attachment" {
filename = inferFilename(att.Data)
}
attachments = append(attachments, EmailAttachment{
Filename: filename,
ContentType: mimeTypeFromFilename(filename),
Data: att.Data,
})
}
// Recursively dig into embedded MAPI messages stored in
// attAttachment (0x9005) → PR_ATTACH_DATA_OBJ (0x3701).
for _, stream := range findEmbeddedTNEFStreamsFromRaw(data) {
subAtts, _ := extractTNEFRecursive(stream, depth+1)
attachments = append(attachments, subAtts...)
}
return attachments, nil
}
// isEmbeddedMsgPlaceholder returns true if the attachment is a tiny placeholder
// that Outlook generates for embedded MAPI messages ("L'allegato è un messaggio
// incorporato MAPI 1.0" or equivalent in other languages).
func isEmbeddedMsgPlaceholder(att *tnef.Attachment) bool {
if len(att.Data) > 300 {
return false
}
lower := strings.ToLower(string(att.Data))
return strings.Contains(lower, "mapi 1.0") ||
strings.Contains(lower, "embedded message") ||
strings.Contains(lower, "messaggio incorporato")
}
// inferFilename picks a reasonable filename based on the data's magic bytes.
func inferFilename(data []byte) string {
if looksLikeEML(data) {
return "embedded_message.eml"
}
if isTNEFData(data) {
return "embedded.dat"
}
if len(data) >= 8 {
if data[0] == 0xD0 && data[1] == 0xCF && data[2] == 0x11 && data[3] == 0xE0 {
return "embedded_message.msg"
}
}
return "attachment.dat"
}
// looksLikeEML returns true if data starts with typical RFC 5322 headers.
func looksLikeEML(data []byte) bool {
if len(data) < 20 {
return false
}
// Quick check: must start with printable ASCII
if data[0] < 32 || data[0] > 126 {
return false
}
prefix := strings.ToLower(string(data[:min(200, len(data))]))
return strings.HasPrefix(prefix, "mime-version:") ||
strings.HasPrefix(prefix, "from:") ||
strings.HasPrefix(prefix, "received:") ||
strings.HasPrefix(prefix, "date:") ||
strings.HasPrefix(prefix, "content-type:") ||
strings.HasPrefix(prefix, "return-path:")
}
// expandTNEFAttachments iterates over the attachment list and replaces any
// TNEF-encoded winmail.dat entries with the files they contain. Attachments
// that are not TNEF are passed through unchanged.
func expandTNEFAttachments(attachments []EmailAttachment) []EmailAttachment {
var result []EmailAttachment
for _, att := range attachments {
if isTNEFAttachment(att) {
extracted, err := extractTNEFAttachments(att.Data)
if err == nil && len(extracted) > 0 {
result = append(result, extracted...)
continue
}
// If extraction fails, keep the original blob.
}
result = append(result, att)
}
return result
}
// ---------------------------------------------------------------------------
// Raw TNEF attribute scanner — extracts nested TNEF streams from embedded
// MAPI messages that the teamwork/tnef library does not handle.
// ---------------------------------------------------------------------------
// findEmbeddedTNEFStreamsFromRaw scans the raw TNEF byte stream for
// attAttachment (0x00069005) attribute blocks, parses their MAPI properties,
// and extracts any PR_ATTACH_DATA_OBJ (0x3701) values that begin with a
// TNEF signature.
func findEmbeddedTNEFStreamsFromRaw(tnefData []byte) [][]byte {
if len(tnefData) < 6 || !isTNEFData(tnefData) {
return nil
}
var streams [][]byte
offset := 6 // skip TNEF signature (4) + key (2)
for offset+9 < len(tnefData) {
level := tnefData[offset]
attrID := binary.LittleEndian.Uint32(tnefData[offset+1 : offset+5])
attrLen := int(binary.LittleEndian.Uint32(tnefData[offset+5 : offset+9]))
dataStart := offset + 9
if dataStart+attrLen > len(tnefData) || attrLen < 0 {
break
}
// attAttachment (0x00069005) at attachment level (0x02)
if level == 0x02 && attrID == 0x00069005 && attrLen > 100 {
mapiData := tnefData[dataStart : dataStart+attrLen]
embedded := extractPRAttachDataObjFromMAPI(mapiData)
if embedded != nil && len(embedded) > 22 {
// Skip the 16-byte IID_IMessage GUID
afterGuid := embedded[16:]
if isTNEFData(afterGuid) {
streams = append(streams, afterGuid)
}
}
}
// level(1) + id(4) + len(4) + data(attrLen) + checksum(2)
offset += 9 + attrLen + 2
}
return streams
}
// extractPRAttachDataObjFromMAPI parses a MAPI properties block (from an
// attAttachment attribute) and returns the raw value of PR_ATTACH_DATA_OBJ
// (property ID 0x3701, type PT_OBJECT 0x000D).
func extractPRAttachDataObjFromMAPI(data []byte) []byte {
if len(data) < 4 {
return nil
}
count := int(binary.LittleEndian.Uint32(data[0:4]))
off := 4
for i := 0; i < count && off+4 <= len(data); i++ {
propTag := binary.LittleEndian.Uint32(data[off : off+4])
propType := propTag & 0xFFFF
propID := (propTag >> 16) & 0xFFFF
off += 4
// Named properties (ID >= 0x8000) have extra GUID + kind fields.
if propID >= 0x8000 {
if off+20 > len(data) {
return nil
}
kind := binary.LittleEndian.Uint32(data[off+16 : off+20])
off += 20
if kind == 0 { // MNID_ID
off += 4
} else { // MNID_STRING
if off+4 > len(data) {
return nil
}
nameLen := int(binary.LittleEndian.Uint32(data[off : off+4]))
off += 4 + nameLen
off += padTo4(nameLen)
}
}
off = skipMAPIPropValue(data, off, propType, propID)
if off < 0 {
return nil // parse error
}
// If skipMAPIPropValue returned a special sentinel, extract it.
// We use a hack: skipMAPIPropValue can't return the data directly,
// so we handle PT_OBJECT / 0x3701 inline below.
}
// Simpler approach: re-scan specifically for 0x3701.
return extractPRAttachDataObjDirect(data)
}
// extractPRAttachDataObjDirect re-scans the MAPI property block and
// returns the raw value of PR_ATTACH_DATA_OBJ (0x3701, PT_OBJECT).
func extractPRAttachDataObjDirect(data []byte) []byte {
if len(data) < 4 {
return nil
}
count := int(binary.LittleEndian.Uint32(data[0:4]))
off := 4
for i := 0; i < count && off+4 <= len(data); i++ {
propTag := binary.LittleEndian.Uint32(data[off : off+4])
propType := propTag & 0xFFFF
propID := (propTag >> 16) & 0xFFFF
off += 4
// Skip named property headers.
if propID >= 0x8000 {
if off+20 > len(data) {
return nil
}
kind := binary.LittleEndian.Uint32(data[off+16 : off+20])
off += 20
if kind == 0 {
off += 4
} else {
if off+4 > len(data) {
return nil
}
nameLen := int(binary.LittleEndian.Uint32(data[off : off+4]))
off += 4 + nameLen
off += padTo4(nameLen)
}
}
switch propType {
case 0x0002: // PT_SHORT (padded to 4)
off += 4
case 0x0003, 0x000A: // PT_LONG, PT_ERROR
off += 4
case 0x000B: // PT_BOOLEAN (padded to 4)
off += 4
case 0x0004: // PT_FLOAT
off += 4
case 0x0005: // PT_DOUBLE
off += 8
case 0x0006: // PT_CURRENCY
off += 8
case 0x0007: // PT_APPTIME
off += 8
case 0x0014: // PT_I8
off += 8
case 0x0040: // PT_SYSTIME
off += 8
case 0x0048: // PT_CLSID
off += 16
case 0x001E, 0x001F: // PT_STRING8, PT_UNICODE
off = skipCountedBlobs(data, off)
case 0x0102: // PT_BINARY
off = skipCountedBlobs(data, off)
case 0x000D: // PT_OBJECT
if off+4 > len(data) {
return nil
}
cnt := int(binary.LittleEndian.Uint32(data[off : off+4]))
off += 4
for j := 0; j < cnt; j++ {
if off+4 > len(data) {
return nil
}
olen := int(binary.LittleEndian.Uint32(data[off : off+4]))
off += 4
if propID == 0x3701 && off+olen <= len(data) {
return data[off : off+olen]
}
off += olen
off += padTo4(olen)
}
case 0x1002: // PT_MV_SHORT
off = skipMVFixed(data, off, 4)
case 0x1003: // PT_MV_LONG
off = skipMVFixed(data, off, 4)
case 0x1005: // PT_MV_DOUBLE
off = skipMVFixed(data, off, 8)
case 0x1014: // PT_MV_I8
off = skipMVFixed(data, off, 8)
case 0x1040: // PT_MV_SYSTIME
off = skipMVFixed(data, off, 8)
case 0x101E, 0x101F: // PT_MV_STRING8, PT_MV_UNICODE
off = skipCountedBlobs(data, off)
case 0x1048: // PT_MV_CLSID
off = skipMVFixed(data, off, 16)
case 0x1102: // PT_MV_BINARY
off = skipCountedBlobs(data, off)
default:
// Unknown type, can't continue
return nil
}
if off < 0 || off > len(data) {
return nil
}
}
return nil
}
// skipCountedBlobs advances past a MAPI value that stores count + N
// length-prefixed blobs (used by PT_STRING8, PT_UNICODE, PT_BINARY, and
// their multi-valued variants).
func skipCountedBlobs(data []byte, off int) int {
if off+4 > len(data) {
return -1
}
cnt := int(binary.LittleEndian.Uint32(data[off : off+4]))
off += 4
for j := 0; j < cnt; j++ {
if off+4 > len(data) {
return -1
}
blen := int(binary.LittleEndian.Uint32(data[off : off+4]))
off += 4 + blen
off += padTo4(blen)
}
return off
}
// skipMVFixed advances past a multi-valued fixed-size property
// (count followed by count*elemSize bytes).
func skipMVFixed(data []byte, off int, elemSize int) int {
if off+4 > len(data) {
return -1
}
cnt := int(binary.LittleEndian.Uint32(data[off : off+4]))
off += 4 + cnt*elemSize
return off
}
// skipMAPIPropValue is a generic value skipper (unused in the current flow
// but kept for completeness).
func skipMAPIPropValue(data []byte, off int, propType uint32, _ uint32) int {
switch propType {
case 0x0002:
return off + 4
case 0x0003, 0x000A, 0x000B, 0x0004:
return off + 4
case 0x0005, 0x0006, 0x0007, 0x0014, 0x0040:
return off + 8
case 0x0048:
return off + 16
case 0x001E, 0x001F, 0x0102, 0x000D:
return skipCountedBlobs(data, off)
case 0x1002, 0x1003:
return skipMVFixed(data, off, 4)
case 0x1005, 0x1014, 0x1040:
return skipMVFixed(data, off, 8)
case 0x1048:
return skipMVFixed(data, off, 16)
case 0x101E, 0x101F, 0x1102:
return skipCountedBlobs(data, off)
default:
return -1
}
}
// padTo4 returns the number of padding bytes needed to reach a 4-byte boundary.
func padTo4(n int) int {
r := n % 4
if r == 0 {
return 0
}
return 4 - r
}
// ---------------------------------------------------------------------------
// MIME type helper
// ---------------------------------------------------------------------------
// mimeTypeFromFilename guesses the MIME type from a file extension.
// Falls back to "application/octet-stream" when the type is unknown.
func mimeTypeFromFilename(filename string) string {
ext := strings.ToLower(filepath.Ext(filename))
if ext == "" {
return "application/octet-stream"
}
t := mime.TypeByExtension(ext)
if t == "" {
return "application/octet-stream"
}
// Strip any parameters (e.g. "; charset=utf-8")
if idx := strings.Index(t, ";"); idx != -1 {
t = strings.TrimSpace(t[:idx])
}
return t
}