PDF解析帮助类

public class ComPDFHepler
{
/// <summary>
/// 正则获取字符串中两个字符串间的内容
/// </summary>
/// <param name="str"></param>
/// <param name="s"></param>
/// <param name="e"></param>
/// <returns></returns>
public static string GetValue(string str, string s, string e, bool isContansE)
{
Regex rg = new Regex("(?<=(" + s + "))[.\s\S]*?(?=(" + e + "))", RegexOptions.Multiline | RegexOptions.Singleline);
Match matchs = rg.Match(str);
if (isContansE)
{
return matchs.Groups[0].Value + matchs.Groups[2].Value;
}
else
{
return matchs.Groups[0].Value;
}

}

public static string GetValue(string str, string s, string e)
{
Regex rg = new Regex("(?<=(" + s + "))[.\s\S]*?(?=(" + e + "))", RegexOptions.Multiline | RegexOptions.Singleline);
Match matchs = rg.Match(str);

return matchs.Groups[0].Value + matchs.Groups[2].Value;
}
/// <summary>
/// 以startString起,取值到结束
/// </summary>
/// <param name="str"></param>
/// <param name="s"></param>
/// <returns></returns>
public static string GetStartWithValue(string str, string startString)
{
Regex rg = new Regex("(?<=(" + startString + "))[.\s\S]*", RegexOptions.Multiline | RegexOptions.Singleline);
Match matchs = rg.Match(str);
return matchs.Value;
}
/// <summary>
/// 转换成列表
/// </summary>
/// <param name="tempContent"></param>
/// <returns></returns>
public static List<String> ConvertToArrayString(string tempContent)
{

List<String> list = new List<string>();


StringReader sr = new StringReader(tempContent);

StringBuilder sb = new StringBuilder();

//读第一行

var strContent = sr.ReadLine();

Console.WriteLine(strContent);
//循环读所有的内容

while ((strContent = sr.ReadLine()) != null)
{
if (strContent.Equals(" "))
{
continue;
}

bool isMatch = Regex.IsMatch(strContent, @"(^d+.)20[0-9][0-9]");
if (isMatch)
{
if (!string.IsNullOrEmpty(sb.ToString()))
{
list.Add(sb.ToString());

sb = new StringBuilder();
}

}


sb.AppendLine(strContent);

}

sr.Close();

if (!string.IsNullOrEmpty(sb.ToString()))
{
list.Add(sb.ToString());
}


return list;

}

/// <summary>
/// 获取需要分析的(一)贷款 内容
/// </summary>
/// <param name="tagPath"></param>
/// <returns></returns>
public static string GetAnalysisContent(string tagPath)
{

StringReader sr = new StringReader(tagPath);

StringBuilder sb = new StringBuilder();
bool canRead = false;
//读第一行

var strContent = sr.ReadLine();

Console.WriteLine(strContent);
//循环读所有的内容

while ((strContent = sr.ReadLine()) != null)
{
//读到“信 贷 交 易 信 息 明 细 信 贷 交 易 信 息 明 细”,则是需要分析的文件,将其放在临时变量中sb,
//一直读到“( 二 ) 贷 记 卡 ( 二 ) 贷 记 卡”。结束。
if (canRead)
{
if (Regex.IsMatch(strContent, "(()[一二三四五六七八九十]())") && !strContent.Contains("贷款"))
{
canRead = false;
}
else
{
sb.AppendLine(strContent);
}

}
else
{
canRead = strContent.Contains("信贷交易信息明细");
}

}

sr.Close();

return sb.ToString();

}


/// <summary>
/// 获取需要分析的(二)贷记卡 内容
/// </summary>
/// <param name="tagPath"></param>
/// <returns></returns>
public static string GetAnyTextCardInfo(string tagPath)
{

string text = ComPDFHepler.GetValue(tagPath, "信贷交易信息明细", "查询记录", false);
text = ComPDFHepler.GetStartWithValue(text, "贷记卡");
return text;


}

/// <summary>
/// 读取PDF文件
/// </summary>
/// <param name="file"></param>
/// <returns></returns>
public static string pdf2txt(FileInfo file)
{

PDDocument doc = PDDocument.load(file.FullName);

PDFTextStripper pdfStripper = new PDFTextStripper();

string text = pdfStripper.getText(doc);


doc.close();

return text;

}

/// <summary>
/// 是否是数据
/// </summary>
/// <param name="text"></param>
/// <returns></returns>
public static bool IsNumber(string text)
{
return Regex.IsMatch(text, @"d+");
}

/// <summary>
/// 获取字符串中的所有数字
/// </summary>
/// <param name="par"></param>
/// <returns></returns>
public static int GetNumber(string par)
{
string strTempContent = par;
strTempContent = System.Text.RegularExpressions.Regex.Replace(strTempContent, @"[^d]*", "");
return Convert.ToInt32(strTempContent);
}

/// <summary>
/// 获取字符串中的所有数字,以逗号隔开
/// </summary>
/// <param name="par"></param>
/// <returns></returns>
public static string GetNumberSplit(string par)
{
string strTempContent = par;
strTempContent = System.Text.RegularExpressions.Regex.Replace(strTempContent, @"D+", ",");
return strTempContent;
}

/// <summary>
/// 获取逾期记录表
/// </summary>
/// <param name="text"></param>
/// <returns></returns>
public static List<string> GetListOverDueRecord(string text)
{
string tempTableString = ComPDFHepler.GetStartWithValue(text, "逾期金额 ");
var arrayString = tempTableString.Split(" ".ToCharArray());
List<string> list = new List<string>();
foreach (string item in arrayString)
{
if (!string.IsNullOrEmpty(item))
{
list.Add(item);
}

}
return list;
}

/// <summary>
/// 是否为Month个月内
/// </summary>
/// <param name="date"></param>
/// <returns></returns>
public static bool isInMonth(string date, int month)
{
string dateFormat = date.Replace('.', '-');
try
{
DateTime dt = DateTime.Parse(dateFormat);
double days = DateTime.Now.Subtract(dt).TotalDays;
if (days / 30 <= month)
{
return true;
}
else
{
return false;
}

}
catch (Exception)
{
return false;
}

}
}

原文地址:https://www.cnblogs.com/wolf12/p/5439837.html