using System;
using System.IO;
using System.Linq;
using System.Text;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
/// <summary>
/// Opens a Word document read-only and returns the normalized text of its main body.
/// </summary>
/// <param name="path">Path of the document to open.</param>
/// <param name="password">Value assigned to <c>OpenSettings.Password</c> when opening the package.</param>
/// <returns>
/// The cleaned body text, truncated just before the last occurrence of the marker
/// "限公司第" when that marker is found after position 0; otherwise the full cleaned text.
/// Returns <c>null</c> when the document has no body or when any exception occurs
/// (the exception is logged and swallowed).
/// </returns>
public static string GetWordContentByOpenXml(string path, string password)
{
    try
    {
        // NOTE(review): the stock Open XML SDK OpenSettings does not appear to expose a
        // Password property — confirm which OpenSettings type this project compiles against.
        var settings = new OpenSettings()
        {
            Password = password
        };

        using (var document = WordprocessingDocument.Open(path, false, settings))
        {
            var body = document.MainDocumentPart?.Document?.Body;
            if (body == null)
            {
                return null;
            }

            var contentBuilder = new StringBuilder();
            ExtractBodyContent(body, contentBuilder);
            string content = CleanContent(contentBuilder.ToString());

            // Ordinal search: the marker is a fixed literal, not linguistic text.
            // NOTE(review): the marker presumably identifies trailing boilerplate — confirm with the caller.
            int index = content.LastIndexOf("限公司第", StringComparison.Ordinal);
            return index > 0
                ? content.Substring(0, index).Trim()
                : content;
        }
    }
    catch (Exception ex)
    {
        // Log the full exception (type, message, inner exceptions and stack trace);
        // the stack trace alone does not say what actually went wrong.
        LogManager.WriteError("GetWordContentByOpenXml()", ex.ToString());
        return null;
    }
}
/// <summary>
/// Appends the text of every direct child of <paramref name="body"/> to
/// <paramref name="contentBuilder"/>, delegating each child to <c>ExtractElementContent</c>.
/// </summary>
/// <param name="body">Document body whose direct children are visited in order.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractBodyContent(Body body, StringBuilder contentBuilder)
{
    foreach (OpenXmlElement child in body.Elements())
    {
        ExtractElementContent(child, contentBuilder);
    }
}
/// <summary>
/// Appends the text of a single element to <paramref name="contentBuilder"/>.
/// Paragraphs are followed by a line break, tables are delegated to
/// <c>ExtractTableContent</c>, section properties are skipped, and any other
/// element is searched recursively through its children.
/// </summary>
/// <param name="element">Element to extract text from.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractElementContent(OpenXmlElement element, StringBuilder contentBuilder)
{
    if (element is Paragraph para)
    {
        ExtractParagraphContent(para, contentBuilder);
        contentBuilder.AppendLine();
        return;
    }

    if (element is Table tbl)
    {
        ExtractTableContent(tbl, contentBuilder);
        return;
    }

    if (element is SectionProperties)
    {
        // Section properties contribute nothing to the extracted text.
        return;
    }

    // Unknown container: descend so that paragraphs and tables nested inside it are still found.
    foreach (var nested in element.Elements())
    {
        ExtractElementContent(nested, contentBuilder);
    }
}
/// <summary>
/// Appends the text of a paragraph's direct runs to <paramref name="contentBuilder"/>.
/// Within each run, text, tab and break elements are emitted in document order:
/// text as-is, a tab as a tab character, and a break as a line break.
/// </summary>
/// <remarks>
/// Only runs that are direct children of the paragraph are visited; runs nested in
/// other containers (for example hyperlinks) are not.
/// </remarks>
/// <param name="paragraph">Paragraph whose direct runs are read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractParagraphContent(Paragraph paragraph, StringBuilder contentBuilder)
{
    foreach (var run in paragraph.Elements<Run>())
    {
        // Walk the run's children once, in order. Collecting all texts, then all tabs,
        // then all breaks would move separators after the text they are meant to split.
        foreach (var child in run.Elements())
        {
            switch (child)
            {
                case Text text:
                    contentBuilder.Append(text.Text);
                    break;
                case TabChar _:
                    contentBuilder.Append("\t");
                    break;
                case Break _:
                    contentBuilder.AppendLine();
                    break;
            }
        }
    }
}
/// <summary>
/// Appends the text of a table to <paramref name="contentBuilder"/>: every cell is
/// followed by a tab character and every row by a line break.
/// </summary>
/// <remarks>
/// NOTE(review): only paragraphs that are direct children of a cell are read, and the
/// paragraphs of one cell are concatenated without a separator; nested tables are not
/// visited. Confirm this is the intended output.
/// </remarks>
/// <param name="table">Table whose direct rows and cells are read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractTableContent(Table table, StringBuilder contentBuilder)
{
    foreach (TableRow tableRow in table.Elements<TableRow>())
    {
        foreach (TableCell tableCell in tableRow.Elements<TableCell>())
        {
            foreach (Paragraph cellParagraph in tableCell.Elements<Paragraph>())
            {
                ExtractParagraphContent(cellParagraph, contentBuilder);
            }

            // Cell delimiter.
            contentBuilder.Append('\t');
        }

        // Row delimiter.
        contentBuilder.AppendLine();
    }
}
/// <summary>
/// Normalizes extracted text into a single line: removes non-whitespace control
/// characters and the Aspose evaluation watermark, collapses every run of whitespace
/// (including tabs and line breaks) into one space, and trims the result.
/// </summary>
/// <param name="content">Raw extracted text; may be <c>null</c> or empty.</param>
/// <returns>The normalized text, or an empty string for <c>null</c>/empty input.</returns>
private static string CleanContent(string content)
{
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    // Strip control characters that the regex \s class does not treat as whitespace.
    // \x0B and \x0C are left for the whitespace pass so they keep acting as separators.
    content = System.Text.RegularExpressions.Regex.Replace(content, @"[\x00-\x08\x0E-\x1F\x7F]", "");

    // Remove the watermark before collapsing so the gap it leaves is normalized too.
    content = content.Replace("EvaluationOnly.CreatedwithAspose.Words.Copyright2003-2024AsposePtyLtd.", "");

    // Collapse last: removals above can never leave doubled spaces behind.
    // (A separate blank-line pass is unnecessary — no newline survives this step.)
    content = System.Text.RegularExpressions.Regex.Replace(content, @"\s+", " ");

    return content.Trim();
}
/// <summary>
/// Probes whether a Word document can be opened without a password.
/// </summary>
/// <param name="path">Path of the document to probe.</param>
/// <returns>
/// <c>false</c> when the document opens read-only without error. When opening throws an
/// <c>OpenXmlPackageException</c>, <c>true</c> only if its message mentions "password",
/// "encrypted" or "protected" (case-insensitive). Any other exception yields <c>true</c>.
/// </returns>
public static bool IsPasswordRequired(string path)
{
    try
    {
        // Only the success or failure of opening matters; the document itself is not used.
        using (WordprocessingDocument.Open(path, false))
        {
            return false;
        }
    }
    catch (OpenXmlPackageException ex)
    {
        string message = ex.Message ?? string.Empty;
        string[] keywords = { "password", "encrypted", "protected" };

        // Case-insensitive so a capitalised word in the message is still recognised.
        foreach (string keyword in keywords)
        {
            if (message.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return true;
            }
        }

        return false;
    }
    catch
    {
        // NOTE(review): every other failure (including a missing or corrupt file) is
        // reported as "password required". Encrypted packages presumably fail here rather
        // than with OpenXmlPackageException, so this is kept — confirm it is intended.
        return true;
    }
}
/// <summary>
/// Opens a Word document read-only and returns the normalized text of its main body,
/// optionally including hyperlink text and footnotes.
/// </summary>
/// <param name="path">Path of the document to open.</param>
/// <param name="password">Value assigned to <c>OpenSettings.Password</c> when opening the package.</param>
/// <param name="includeHyperlinks">When <c>true</c>, text of runs inside hyperlinks is included.</param>
/// <param name="includeFootnotes">When <c>true</c> and the document has a footnotes part, footnote text is appended after the body.</param>
/// <returns>
/// The cleaned text, truncated just before the last occurrence of the marker "公司第"
/// when that marker is found after position 0; otherwise the full cleaned text.
/// Returns <c>null</c> when the document has no body or when any exception occurs
/// (the exception is logged and swallowed).
/// </returns>
public static string GetWordContentByOpenXmlAdvanced(string path, string password, bool includeHyperlinks = false, bool includeFootnotes = false)
{
    try
    {
        // NOTE(review): the stock Open XML SDK OpenSettings does not appear to expose a
        // Password property — confirm which OpenSettings type this project compiles against.
        var settings = new OpenSettings()
        {
            Password = password
        };

        using (var document = WordprocessingDocument.Open(path, false, settings))
        {
            var mainPart = document.MainDocumentPart;
            var body = mainPart?.Document?.Body;
            if (body == null)
            {
                return null;
            }

            var contentBuilder = new StringBuilder();
            ExtractBodyContentAdvanced(body, contentBuilder, includeHyperlinks);

            if (includeFootnotes && mainPart.FootnotesPart != null)
            {
                ExtractFootnotesContent(mainPart.FootnotesPart, contentBuilder);
            }

            string content = CleanContent(contentBuilder.ToString());

            // Ordinal search: the marker is a fixed literal, not linguistic text.
            // NOTE(review): the marker presumably identifies trailing boilerplate — confirm with the caller.
            int index = content.LastIndexOf("公司第", StringComparison.Ordinal);
            return index > 0
                ? content.Substring(0, index).Trim()
                : content;
        }
    }
    catch (Exception ex)
    {
        // Log the full exception (type, message, inner exceptions and stack trace);
        // the stack trace alone does not say what actually went wrong.
        LogManager.WriteError("GetWordContentByOpenXmlAdvanced()", ex.ToString());
        return null;
    }
}
/// <summary>
/// Appends the text of every element in <paramref name="body"/> to
/// <paramref name="contentBuilder"/>. Paragraphs are followed by a line break, tables
/// are delegated to <c>ExtractTableContentAdvanced</c>, section properties are skipped,
/// and any other element is searched recursively through its children.
/// </summary>
/// <param name="body">Document body to read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
/// <param name="includeHyperlinks">When <c>true</c>, text of runs inside hyperlinks is included.</param>
private static void ExtractBodyContentAdvanced(Body body, StringBuilder contentBuilder, bool includeHyperlinks)
{
    ExtractChildContentAdvanced(body, contentBuilder, includeHyperlinks);
}

/// <summary>
/// Recursive worker for <c>ExtractBodyContentAdvanced</c>: visits the direct children of
/// <paramref name="parent"/> and descends into containers that are neither paragraphs,
/// tables nor section properties.
/// </summary>
private static void ExtractChildContentAdvanced(OpenXmlElement parent, StringBuilder contentBuilder, bool includeHyperlinks)
{
    foreach (var element in parent.Elements())
    {
        switch (element)
        {
            case Paragraph paragraph:
                ExtractParagraphContentAdvanced(paragraph, contentBuilder, includeHyperlinks);
                contentBuilder.AppendLine();
                break;
            case Table table:
                ExtractTableContentAdvanced(table, contentBuilder, includeHyperlinks);
                break;
            case SectionProperties _:
                break;
            default:
                // Recurse on the element itself. Wrapping its children in a temporary Body
                // would try to re-parent nodes that already belong to the document tree,
                // which the SDK rejects.
                ExtractChildContentAdvanced(element, contentBuilder, includeHyperlinks);
                break;
        }
    }
}
/// <summary>
/// Appends the text of a paragraph to <paramref name="contentBuilder"/>. Text elements of
/// direct runs are always included; text of runs inside hyperlinks is included only when
/// <paramref name="includeHyperlinks"/> is <c>true</c>.
/// </summary>
/// <remarks>Only text elements are read here; tab and break elements are not emitted.</remarks>
/// <param name="paragraph">Paragraph whose direct children are read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
/// <param name="includeHyperlinks">Whether hyperlink runs contribute text.</param>
private static void ExtractParagraphContentAdvanced(Paragraph paragraph, StringBuilder contentBuilder, bool includeHyperlinks)
{
    // Shared by plain runs and hyperlink runs.
    void AppendRunText(Run source)
    {
        foreach (var piece in source.Elements<Text>())
        {
            contentBuilder.Append(piece.Text);
        }
    }

    foreach (var child in paragraph.Elements())
    {
        switch (child)
        {
            case Run directRun:
                AppendRunText(directRun);
                break;
            case Hyperlink link when includeHyperlinks:
                foreach (var linkedRun in link.Elements<Run>())
                {
                    AppendRunText(linkedRun);
                }
                break;
        }
    }
}
/// <summary>
/// Appends the text of a table to <paramref name="contentBuilder"/>: every cell is
/// followed by a tab character and every row by a line break. Cell paragraphs are read
/// with <c>ExtractParagraphContentAdvanced</c>.
/// </summary>
/// <remarks>
/// NOTE(review): only paragraphs that are direct children of a cell are read, and the
/// paragraphs of one cell are concatenated without a separator; nested tables are not
/// visited. Confirm this is the intended output.
/// </remarks>
/// <param name="table">Table whose direct rows and cells are read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
/// <param name="includeHyperlinks">Passed through to the paragraph extractor.</param>
private static void ExtractTableContentAdvanced(Table table, StringBuilder contentBuilder, bool includeHyperlinks)
{
    foreach (TableRow tableRow in table.Elements<TableRow>())
    {
        foreach (TableCell tableCell in tableRow.Elements<TableCell>())
        {
            foreach (Paragraph cellParagraph in tableCell.Elements<Paragraph>())
            {
                ExtractParagraphContentAdvanced(cellParagraph, contentBuilder, includeHyperlinks);
            }

            // Cell delimiter.
            contentBuilder.Append('\t');
        }

        // Row delimiter.
        contentBuilder.AppendLine();
    }
}
/// <summary>
/// Appends a footnotes heading followed by the text of every footnote paragraph, one
/// line per paragraph. Does nothing when the part has no footnotes root element.
/// </summary>
/// <param name="footnotesPart">Footnotes part of the document.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractFootnotesContent(FootnotesPart footnotesPart, StringBuilder contentBuilder)
{
    var footnotes = footnotesPart.Footnotes;
    if (footnotes == null)
    {
        return;
    }

    // Heading that separates footnote text from the body text.
    contentBuilder.AppendLine("\n--- 脚注 ---");

    var footnoteParagraphs = footnotes
        .Elements<Footnote>()
        .SelectMany(note => note.Elements<Paragraph>());

    foreach (var footnoteParagraph in footnoteParagraphs)
    {
        ExtractParagraphContent(footnoteParagraph, contentBuilder);
        contentBuilder.AppendLine();
    }
}