-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIsdocAttachmentExtractor.cs
More file actions
78 lines (63 loc) · 2.33 KB
/
IsdocAttachmentExtractor.cs
File metadata and controls
78 lines (63 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
using System.Text;
using iText.Kernel.Pdf;
namespace MKTestAzureFunction;
/// <summary>
/// Extracts the XML text of an ISDOC attachment embedded in a PDF document.
/// </summary>
public sealed class IsdocAttachmentExtractor
{
    private const string IsdocExtension = ".isdoc";
    private const string XmlExtension = ".xml";

    /// <summary>
    /// How strongly an embedded file resembles an ISDOC document.
    /// Lower values are stronger matches; <see cref="None"/> means "not a candidate".
    /// </summary>
    private enum MatchStrength
    {
        IsdocFileName = 0,
        XmlFileName = 1,
        XmlLikeContent = 2,
        None = 3,
    }

    /// <summary>
    /// Reads the PDF from <paramref name="pdfStream"/> and returns the decoded text of the
    /// embedded file that most strongly resembles an ISDOC document.
    /// </summary>
    /// <remarks>
    /// Candidates are ranked: a file name ending in <c>.isdoc</c> beats a file name ending in
    /// <c>.xml</c>, which beats a payload that merely starts with <c>&lt;</c>. Among equally
    /// ranked candidates the first one enumerated wins.
    /// NOTE(review): the PdfReader/PdfDocument are disposed on return; whether that also closes
    /// <paramref name="pdfStream"/> depends on iText's reader settings — confirm against callers.
    /// </remarks>
    /// <param name="pdfStream">Stream positioned at the start of a PDF document.</param>
    /// <returns>The attachment content decoded to a string.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfStream"/> is null.</exception>
    /// <exception cref="InvalidOperationException">No embedded file qualifies as a candidate.</exception>
    public string ExtractIsdocXml(Stream pdfStream)
    {
        ArgumentNullException.ThrowIfNull(pdfStream);

        using var reader = new PdfReader(pdfStream);
        using var pdfDocument = new PdfDocument(reader);

        // Track the strongest fallback seen so far; an ".isdoc" file name short-circuits the scan.
        var bestContent = Array.Empty<byte>();
        var bestStrength = MatchStrength.None;

        foreach (var (name, content) in GetEmbeddedFiles(pdfDocument))
        {
            var strength = Classify(name, content);
            if (strength == MatchStrength.IsdocFileName)
            {
                return DecodeXml(content);
            }

            // Strict '<' keeps the first candidate among equally ranked ones.
            if (strength < bestStrength)
            {
                bestContent = content;
                bestStrength = strength;
            }
        }

        if (bestStrength != MatchStrength.None)
        {
            return DecodeXml(bestContent);
        }

        throw new InvalidOperationException("No ISDOC XML attachment found in the PDF file.");
    }

    /// <summary>
    /// Enumerates the non-empty embedded files registered in the catalog's EmbeddedFiles name tree.
    /// </summary>
    /// <param name="pdfDocument">The opened PDF document.</param>
    /// <returns>Pairs of resolved file name (possibly empty) and decoded stream bytes.</returns>
    private static IEnumerable<(string Name, byte[] Content)> GetEmbeddedFiles(PdfDocument pdfDocument)
    {
        var files = pdfDocument.GetCatalog().GetNameTree(PdfName.EmbeddedFiles)?.GetNames();
        if (files is null)
        {
            yield break;
        }

        foreach (var entry in files)
        {
            // The tree value is either the file specification itself or an indirect reference to it.
            var fileSpec = entry.Value as PdfDictionary
                ?? (entry.Value as PdfIndirectReference)?.GetRefersTo() as PdfDictionary;
            if (fileSpec is null)
            {
                continue;
            }

            var embeddedFiles = fileSpec.GetAsDictionary(PdfName.EF);
            var stream = embeddedFiles?.GetAsStream(PdfName.UF) ?? embeddedFiles?.GetAsStream(PdfName.F);
            if (stream is null)
            {
                continue;
            }

            var content = stream.GetBytes();
            if (content is null || content.Length == 0)
            {
                continue;
            }

            var treeKey = entry.Key?.ToUnicodeString() ?? string.Empty;
            yield return (ResolveFileName(fileSpec, treeKey), content);
        }
    }

    /// <summary>
    /// Picks the attachment's file name: the file specification's /UF string, then its /F string,
    /// and finally the name-tree key when neither is present or both are empty.
    /// </summary>
    private static string ResolveFileName(PdfDictionary fileSpec, string treeKey)
    {
        foreach (var nameKey in new[] { PdfName.UF, PdfName.F })
        {
            var candidate = fileSpec.GetAsString(nameKey)?.ToUnicodeString();
            if (!string.IsNullOrEmpty(candidate))
            {
                return candidate;
            }
        }

        return treeKey;
    }

    /// <summary>
    /// Ranks an embedded file by file-name extension first and by leading content second.
    /// </summary>
    private static MatchStrength Classify(string fileName, byte[] content)
    {
        if (fileName.EndsWith(IsdocExtension, StringComparison.OrdinalIgnoreCase))
        {
            return MatchStrength.IsdocFileName;
        }

        if (fileName.EndsWith(XmlExtension, StringComparison.OrdinalIgnoreCase))
        {
            return MatchStrength.XmlFileName;
        }

        return StartsWithMarkup(content) ? MatchStrength.XmlLikeContent : MatchStrength.None;
    }

    /// <summary>
    /// Returns true when the first non-whitespace character of the decoded content is '&lt;'.
    /// Reads only as many characters as needed instead of decoding the whole payload.
    /// </summary>
    private static bool StartsWithMarkup(byte[] content)
    {
        using var stream = new MemoryStream(content, writable: false);
        using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true);

        int next;
        while ((next = reader.Read()) >= 0)
        {
            var current = (char)next;
            if (!char.IsWhiteSpace(current))
            {
                return current == '<';
            }
        }

        // Empty or whitespace-only content.
        return false;
    }

    /// <summary>
    /// Decodes the bytes to text as UTF-8, switching encoding when a byte order mark says otherwise.
    /// </summary>
    private static string DecodeXml(byte[] content)
    {
        using var stream = new MemoryStream(content);
        using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true);
        return reader.ReadToEnd();
    }
}