Lucene.Net 搜索(一)

1.引用读取PDF文件组件

   FontBox-0.1.0-dev.dll   

   IKVM.GNU.Classpath.dll

   IKVM.Runtime.dll

   PDFBox-0.7.3.dll

2.添加 Office 互操作组件(Microsoft.Office.Interop.Word、Microsoft.Office.Interop.Excel),用于读取 Word / Excel 文件,这里不再赘述

3.添加盘古分词

  PanGu.dll

  PanGu.HighLight.dll

  PanGu.Lucene.Analyzer.dll

4.添加Lucene.net 引用

  Lucene.Net.dll

5.创建索引库

 1   #region  同步资料到索引库
 2         static Queue<ResourcesModel> TaskQueue = new Queue<ResourcesModel>();
 3         private void tmResSync_Tick(object sender, EventArgs e)
 4         {
 5             //读取数据到队列
 6             try
 7             {
 8                 InitTaskQueue();
 9                 LogHelper.writeLog("WinFrom【同步数据索引库读取数据到队列】成功!!");
10             }
11             catch (Exception ex)
12             {
13                 LogHelper.writeErrLog("WinFrom【同步数据索引库读取数据到队列】:" + ex.Message);
14             }
15         }
16         public void ServiceStart()
17         {
18             Thread TaskThread = new Thread(new ThreadStart(ThreadInvoke));
19             TaskThread.IsBackground = true;
20             TaskThread.Start();
21         }
22         public void ThreadInvoke()
23         {
24             while (true)
25             {
26                 try
27                 {
28                     if (TaskQueue.Count > 0)
29                     {
30                         ResourcesModel res = null;
31                         lock (TaskQueue)
32                         {
33                             res = TaskQueue.Dequeue();
34                         }
35                         //调用方法
36                         new CreateResIndex().CreateIndex(res);
37                     }
38                     else
39                     {
40                         Thread.Sleep(1000);
41                     }
42                 }
43                 catch (Exception ex)
44                 {
45                     LogHelper.writeErrLog("WinFrom【同步数据索引库出错】:"+ex.ToString());
46                 }
47             }
48         }
49         public void InitTaskQueue()
50         {
51             //读取资料中心数据
52            var query = new CreateResIndex().Get_View_CreateResIndex(" and uploadTime is  not  null  and  IsIndex=0 ");
53            if (query!=null)
54            {
55                     for (int i = 0; i < query.Rows.Count; i++)
56                     {
57                         var  model  =new  ResourcesModel();
58                         model.ID  =query.Rows[i]["ID"].ToString();
59                         model.FileName=query.Rows[i]["FileName"]!=null ? query.Rows[i]["FileName"].ToString():"";
60                         model.FilePath=query.Rows[i]["FilePath"]!=null ? query.Rows[i]["FilePath"].ToString():"";
61                         model.CreaetBy=query.Rows[i]["UserName"]!=null ? query.Rows[i]["UserName"].ToString():"";
62                         model.Types=query.Rows[i]["Name"]!=null ? query.Rows[i]["Name"].ToString():"";
63                         model.TypeId=query.Rows[i]["Type"]!=null ? query.Rows[i]["Type"].ToString():"";
64                         model.SimpleDesc=query.Rows[i]["SimpleDesc"]!=null ? query.Rows[i]["SimpleDesc"].ToString():"";
65                         model.Title=query.Rows[i]["Title"]!=null ? query.Rows[i]["Title"].ToString():"";
66                         model.Tags=query.Rows[i]["Tag"]!=null ? query.Rows[i]["Tag"].ToString():"";
67                         model.OP = query.Rows[i]["IsDel"] != null && query.Rows[i]["IsDel"].ToString()!="" ? Convert.ToBoolean(query.Rows[i]["IsDel"].ToString())==true ? "0" : "1":"1";
68                         model.UploadTime = query.Rows[i]["uploadTime"] != null && query.Rows[i]["uploadTime"].ToString() != "" ?  Convert.ToDateTime(query.Rows[i]["uploadTime"]).ToString("yyyy-MM-dd"):"";
69                         TaskQueue.Enqueue(model);
70                     }
71            }
72 
73 
74         }
75         #endregion
View Code
 1  #region  ResourcesModel
 2     public class ResourcesModel
 3     {
 4         public ResourcesModel() { }
 5 
 6         /// <summary>
 7         /// 标识
 8         /// </summary>
 9         public string ID { get; set; }
10 
11         /// <summary>
12         /// 标题
13         /// </summary>
14         public string Title { get; set; }
15 
16         /// <summary>
17         ///标签
18         /// </summary>
19         public string Tags { get; set; }
20 
21         /// <summary>
22         ///创建人
23         /// </summary>
24         public string CreaetBy { get; set; }
25 
26         /// <summary>
27         ///上传时间
28         /// </summary>
29         public string UploadTime { get; set; }
30 
31         /// <summary>
32         ///类别
33         /// </summary>
34         public string Types { get; set; }
35 
36         /// <summary>
37         ///简介
38         /// </summary>
39         public string SimpleDesc { get; set; }
40         /// <summary>
41         ///内容
42         /// </summary>
43         public string ContextDesc { get; set; }
44         /// <summary>
45         ///  有来标注是 删除=0  增加=1   修改=2
46         /// </summary>
47         public string OP { get; set; }
48         /// <summary>
49         /// 类型Id
50         /// </summary>
51         public string TypeId { get; set; }
52         /// <summary>
53         /// 文件路径
54         /// </summary>
55         public string FilePath { get; set; }
56         /// <summary>
57         /// 文件名称
58         /// </summary>
59         public string FileName { get; set; }
60     }
61     #endregion
View Code
  1  #region 读取文件
  2     public class ReadFilesTxt
  3     {
  4         public string ResumeTxt(string path)
  5         {
  6             string str = string.Empty;
  7 
  8             StreamReader reader = new StreamReader(path, System.Text.Encoding.Default);
  9             str = reader.ReadToEnd();
 10 
 11             //再通过查询解析出来的的字符串有没有GB2312 的字段,来判断是否是GB2312格式的,如果是,则重新以GB2312的格式解析
 12             System.Text.RegularExpressions.Regex reGB = new System.Text.RegularExpressions.Regex("GB2312", RegexOptions.IgnoreCase);
 13             System.Text.RegularExpressions.Match mcGB = reGB.Match(str);
 14             if (mcGB.Success)
 15             {
 16                 StreamReader reader2 = new StreamReader(path, System.Text.Encoding.GetEncoding("GB2312"));
 17                 str = reader2.ReadToEnd();
 18             }
 19             return str;
 20         }
 21 
 22         private string ResumeWord(string path)
 23         {
 24             string str = string.Empty;
 25             object missing = System.Reflection.Missing.Value;
 26             object readOnly = true;
 27             object docPathp = path;
 28             Microsoft.Office.Interop.Word.Application wordApp = new Microsoft.Office.Interop.Word.Application();
 29 
 30             Microsoft.Office.Interop.Word.Document wordDoc = wordApp.Documents.Open(ref docPathp,
 31                 ref missing,
 32                 ref readOnly,
 33                 ref missing,
 34                 ref missing,
 35                 ref missing,
 36                 ref missing,
 37                 ref missing,
 38                 ref missing,
 39                 ref missing,
 40                 ref missing,
 41                 ref missing,
 42                 ref missing,
 43                 ref missing,
 44                 ref missing,
 45                 ref missing);
 46             str = wordDoc.Content.Text;
 47             wordDoc.Close(ref missing, ref missing, ref missing);
 48             wordApp.Quit(ref missing, ref missing, ref missing);
 49 
 50             return str;
 51         }
 52 
 53         private string ResumeExcel(string path)
 54         {
 55             string str = string.Empty;
 56             //创建Application对象
 57             Microsoft.Office.Interop.Excel.Application xApp = new Microsoft.Office.Interop.Excel.Application();
 58             xApp.Visible = false;
 59             object readOnly = true;
 60             object missing = System.Reflection.Missing.Value;
 61             ////得到WorkBook对象,
 62             Microsoft.Office.Interop.Excel.Workbook xBook = xApp.Workbooks._Open(path,
 63                   missing, readOnly, missing, missing,
 64                   missing, missing, missing, missing,
 65                   missing, missing, missing, missing);
 66 
 67             var count = xBook.Sheets.Count;
 68             Microsoft.Office.Interop.Excel.Worksheet xSheet;
 69             for (int k = 0; k < count; k++)
 70             {
 71                 xSheet = (Microsoft.Office.Interop.Excel.Worksheet)xBook.Sheets[k + 1];
 72                 var rcount = xSheet.UsedRange.Rows.Count;
 73                 var ccount = xSheet.UsedRange.Columns.Count;
 74 
 75                 for (int m = 0; m < rcount; m++)
 76                 {
 77                     for (int n = 0; n < ccount; n++)
 78                     {
 79                         str = str + ((Microsoft.Office.Interop.Excel.Range)xSheet.Cells[m + 1, n + 1]).Value2;
 80                     }
 81                 }
 82             }
 83 
 84             xSheet = null;
 85             xBook.Close(missing, missing, missing);
 86             xApp.Quit();
 87 
 88             return str;
 89         }
 90 
 91         public string ResumePDF(string path)
 92         {
 93 
 94             org.pdfbox.pdmodel.PDDocument doc = org.pdfbox.pdmodel.PDDocument.load(path);
 95 
 96             org.pdfbox.util.PDFTextStripper pdfStripper = new org.pdfbox.util.PDFTextStripper();
 97 
 98             string text = pdfStripper.getText(doc);
 99 
100             return text;
101 
102         }
103 
104         public string GetReadContext(string ResourceRoute, string path)
105         {
106             StringBuilder sb = new StringBuilder();
107 
108             try
109             {
110                 if (path != "")
111                 {
112                     string[] paths = path.Split(';');
113                     for (int i = 0; i < paths.Length; i++)
114                     {
115                         if (paths[i] != null && paths[i].ToString() != "")
116                         {
117 
118                             string lpath = paths[i].ToString();
119                             var suffix = lpath.Substring(lpath.LastIndexOf(".") + 1, lpath.Length - lpath.LastIndexOf(".") - 1);
120                             if ("doc" == suffix || "docx" == suffix)
121                             {
122                                 sb.Append(ResumeWord(ResourceRoute + lpath));
123                             }
124                             else if ("xls" == suffix || "xlsx" == lpath)
125                             {
126                                 sb.Append(ResumeExcel(ResourceRoute + lpath));
127                             }
128                             else if ("pdf" == suffix)
129                             {
130                                 sb.Append(ResumePDF(ResourceRoute + lpath));
131                             }
132                             else if ("txt" == suffix)
133                             {
134                                 sb.Append(ResumeTxt(ResourceRoute + lpath));
135                             }
136 
137 
138                         }
139                     }
140                 }
141             }
142             catch (Exception ex)
143             {
144 
145                 LogHelper.writeErrLog( "【读取文件出错:文件名称:" + path + " 】 错误消息:" + ex.Message.ToString());
146             }
147 
148             return sb.ToString();
149         }
150 
151         public string GetReadContextSingle(string ResourceRoute, string lpath)
152         {
153             StringBuilder sb = new StringBuilder();
154             try
155             {
156                 if (lpath != "")
157                 {
158                     var suffix = lpath.Substring(lpath.LastIndexOf(".") + 1, lpath.Length - lpath.LastIndexOf(".") - 1);
159                     if ("doc" == suffix || "docx" == suffix)
160                     {
161                         sb.Append(ResumeWord(ResourceRoute + lpath));
162                     }
163                     else if ("xls" == suffix || "xlsx" == lpath)
164                     {
165                         sb.Append(ResumeExcel(ResourceRoute + lpath));
166                     }
167                     else if ("pdf" == suffix)
168                     {
169                         sb.Append(ResumePDF(ResourceRoute + lpath));
170                     }
171                     else if ("txt" == suffix)
172                     {
173                         sb.Append(ResumeTxt(ResourceRoute + lpath));
174                     }
175                 }
176             }
177             catch (Exception ex)
178             {
179 
180                 LogHelper.writeErrLog("【读取文件出错:文件名称:" + ResourceRoute + lpath + " 】 错误消息:" + ex.Message.ToString());
181             }
182 
183             return sb.ToString();
184         }
185     }
186 #endregion 
View Code
  1 #region  创建索引
  2     public class CreateResIndex
  3     {
  4         public static string IndexPath = ConfigurationManager.AppSettings["pathIndex"];//索引文件路径
  5         public static string ResourceRoute = ConfigurationManager.AppSettings["ResourceRoute"];//文件路径
  6 
  7         //   private readonly ILog log = LogManager.GetLogger("CreateIndex");
  8 
  9         #region 属性
 10         /// <summary>
 11         /// 盘古分词器
 12         /// </summary>
 13         protected Analyzer NewPanGuAnalyzer
 14         {
 15             get { return new PanGuAnalyzer(); }
 16 
 17         }
 18 
 19         /// <summary>
 20         /// Lucene.Net的目录-参数
 21         /// </summary>
 22         public FSDirectory DirectoryLuce
 23         {
 24             get
 25             {
 26                 return FSDirectory.Open(new DirectoryInfo(IndexPath), new NativeFSLockFactory());
 27             }
 28         }
 29         #endregion
 30 
 31         #region  创建索引
 32         /// <summary>
 33         ///创建索引
 34         /// </summary>
 35         public void CreateIndex(ResourcesModel res)
 36         {
 37             //创建索引目录
 38             if (!System.IO.Directory.Exists(IndexPath))
 39             {
 40                 System.IO.Directory.CreateDirectory(IndexPath);
 41             }
 42 
 43             //FSDirectory directory = FSDirectory.Open(new DirectoryInfo(IndexDic), new NativeFSLockFactory());
 44             bool isUpdate = IndexReader.IndexExists(DirectoryLuce);
 45             if (isUpdate)
 46             {
 47                 if (IndexWriter.IsLocked(DirectoryLuce))
 48                 {
 49                     IndexWriter.Unlock(DirectoryLuce);
 50                 }
 51             }
 52 
 53             IndexWriter writer = new IndexWriter(DirectoryLuce, NewPanGuAnalyzer, !isUpdate, IndexWriter.MaxFieldLength.UNLIMITED);
 54             List<string> listIsdex = GetResourceTypePublicResources();
 55             List<string> modifyindex = new List<string>();
 56             if (res != null)
 57             {
 58                 if (res.OP == "0")
 59                 {
 60                     writer.DeleteDocuments(new Term("ID", res.ID.ToString().Trim()));
 61                     modifyindex.Add(res.ID.ToString().Trim());
 62                     LogHelper.writeLog("【删除索引编号】 【ID:" + res.ID.ToString().Trim() + "");
 63                 }
 64                 else
 65                 {
 66                    
 67                     if (IsPublicResources(listIsdex, res.TypeId.Trim()))
 68                     {
 69 
 70                         writer.DeleteDocuments(new Term("ID", res.ID.ToString().Trim()));
 71 
 72                         var path = res.FilePath;
 73 
 74                         string ID = res.ID.ToString().Trim();
 75                         string Title = res.Title != null ? res.Title.ToString() : "";
 76 
 77                         string CreaetBy = res.CreaetBy == null ? "" : res.CreaetBy.ToString();
 78                         string UploadTime = res.UploadTime;
 79                         string Types = res.Types != null ? res.Types.ToString() : "";
 80                         string SimpleDesc = res.SimpleDesc == null ? "" : res.SimpleDesc.ToString();
 81                         string Tags = res.Tags != null ? res.Tags.ToString() : "";
 82                         string FileName = res.FileName;
 83                         try
 84                         {
 85                             string ContextDesc = "";
 86                             AddIndex(writer, ID, Title, Tags, SimpleDesc, "1", Types, UploadTime, CreaetBy, FileName);
 87                             if (path != "")
 88                             {
 89                                 string[] paths = path.Split(';');
 90                                 string[] pname = FileName.Split(';');
 91                                 for (int i = 0; i < paths.Length; i++)
 92                                 {
 93                                     if (paths[i] != null && paths[i].ToString() != "")
 94                                     {
 95                                         string lpath = paths[i].ToString();
 96                                         string lname = pname[i].ToString();
 97                                         ContextDesc=  new ReadFilesTxt().GetReadContextSingle(ResourceRoute, lpath);
 98                                         //SimpleDesc=ContextDesc ContextDesc=""
 99                                        string   NewFileName = GetFileName(lpath, lname);
100                                        AddIndex(writer, ID, NewFileName, Tags, ContextDesc, lpath, Types, UploadTime, CreaetBy, FileName);
101                                     }
102                                 }
103                             }
104                             
105                            // string ContextDesc = new ReadFilesTxt().GetReadContext(ResourceRoute,path);
106                             LogHelper.writeLog("【添加索引编号】 【ID:" + res.ID.ToString().Trim() + "");
107                             modifyindex.Add(ID);
108                         }
109                         catch (Exception ex)
110                         {
111                             LogHelper.writeLog("【添加索引失败】 【ID:" + ID + "】:" + ex.Message.ToString());
112 
113                         }
114 
115                     }
116                 }       
117             }
118             writer.Optimize();
119             writer.Close();
120             ModifyResIndex(modifyindex);
121         }
122 
123         public void AddIndex(IndexWriter writer, string ID, string Title, string Tags, string SimpleDesc, string ContextDesc, string Types, string UploadTime, string CreaetBy,string FileName)
124         {
125             try
126             {
127                 Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
128                 doc.Add(new Lucene.Net.Documents.Field("ID", ID, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));//存储且索引
129                 doc.Add(new Lucene.Net.Documents.Field("Title", Title, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
130                 doc.Add(new Lucene.Net.Documents.Field("Tags", Tags, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
131                 doc.Add(new Lucene.Net.Documents.Field("SimpleDesc", SimpleDesc, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
132                 doc.Add(new Lucene.Net.Documents.Field("FileName", FileName, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
133                 doc.Add(new Lucene.Net.Documents.Field("ContextDesc", ContextDesc, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
134                 doc.Add(new Lucene.Net.Documents.Field("Types", Types, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.ANALYZED));//存储且索引
135                 doc.Add(new Lucene.Net.Documents.Field("UploadTime", UploadTime, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NOT_ANALYZED));//存储且索引
136                 doc.Add(new Lucene.Net.Documents.Field("CreaetBy", CreaetBy, Lucene.Net.Documents.Field.Store.YES, Lucene.Net.Documents.Field.Index.NO));
137                 writer.AddDocument(doc);
138             }
139             catch (FileNotFoundException fnfe)
140             {
141                 throw fnfe;
142             }
143             catch (Exception ex)
144             {
145                 throw ex;
146             }
147         }
148         public string GetFileName(object objfilepath, object FileName)
149         {
150             string result = "";
151             if (FileName != null && FileName.ToString() != "")
152             {
153                 result = FileName.ToString();
154             }
155             else
156             {
157                 if (objfilepath != null && objfilepath.ToString() != "")
158                 {
159                     string filename = objfilepath.ToString().Substring(objfilepath.ToString().LastIndexOf(',') + 1).Replace(";", "");
160                     result = filename;
161                 }
162             }
163             return result;
164         }
165         #endregion
166 
167         #region 获取数据库数据
168         /// <summary>
169         /// 获取中心资料库数据
170         /// </summary>
171         /// <param name="whereStr"></param>
172         /// <returns></returns>
173         public DataTable Get_View_CreateResIndex(string whereStr)
174         {
175             string sql = " Select * From Res_View_createResIndex where 1=1  " + whereStr;
176             DataTable dt = new DataTable();
177 
178             try
179             {
180                 DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
181                 if (ds != null && ds.Tables != null && ds.Tables.Count > 0)
182                 {
183                     dt = ds.Tables[0];
184                 }
185             }
186             catch (Exception ex)
187             {
188                 LogHelper.writeLog("【 获取中心资料库数据错误】:" + ex.ToString());
189             }
190             return dt;
191         }
192 
193         public void ModifyResIndex(List<string> list)
194         {
195             string sql = " update  ResourceInfoNew set  IsIndex=1 where id in ({0}) ";
196             StringBuilder sb = new StringBuilder("'-1'");
197             //Ruihua.Common.DbHelperSQL.connectionString = ConfigurationManager.AppSettings["ResConStr"].ToString();
198             LogHelper.writeLog("【更新索引编号开始】:" + string.Join(",", list.ToArray()));
199             if (list.Count > 0)
200             {
201                 for (int i = 0; i < list.Count; i++)
202                 {
203                     sb.Append(",'" + list[i].ToString() + "'");
204                 }
205                 sql = string.Format(sql, sb.ToString());
206                 int result = Ruihua.Common.DbHelperSQL.ExecuteSql(sql);
207                 LogHelper.writeLog("【更新索引编号结束:" + result.ToString() + "】:" + string.Join(",", list.ToArray()));
208             }
209         }
210 
211         /// <summary>
212         /// 判断是否公共资源
213         /// </summary>
214         /// <returns></returns>
215         public bool IsPublicResources(List<string> list, string Id)
216         {
217 
218             if (list.Contains(Id))
219             {
220                 return true;
221             }
222             return false;
223 
224         }
225         public List<string> GetResourceTypePublicResources()
226         {
227             ObjectCache cache = MemoryCache.Default;
228             List<string> ResourceType = cache["ResourceType"] as List<string>;
229             List<string> publicresource = new List<string>();
230             if (ResourceType == null)
231             {
232 
233                // Ruihua.Common.DbHelperSQL.connectionString = ConfigurationManager.AppSettings["ResConStr"].ToString();
234                 string sql = "select *From ResourceType ";
235                 DataSet ds = Ruihua.Common.DbHelperSQL.Query(sql);
236                 if (ds != null && ds.Tables != null && ds.Tables.Count > 0)
237                 {
238                     DataTable dt = ds.Tables[0];
239                     var query1 = from q1 in dt.AsEnumerable()
240                                  where q1.Field<string>("ParentID") == "0"
241                                  select q1;
242                     if (query1 != null)
243                     {
244                         foreach (var item in query1)
245                         {
246                             publicresource.Add(item.Field<string>("TID").Trim());
247                             //第二层
248                             AddListString(ref publicresource, dt, item.Field<string>("TID").Trim());
249                         }
250                     }
251                 }
252                 CacheItemPolicy policy = new CacheItemPolicy();
253                 policy.AbsoluteExpiration = DateTimeOffset.Now.AddSeconds(1800.0);//属性设置为 60*30 秒后逐出缓存 
254                 cache.Set("ResourceType", publicresource, policy);
255             }
256             else
257             {
258                 publicresource = ResourceType;
259             }
260             return publicresource;
261 
262         }
263         public void AddListString(ref  List<string> list, DataTable dt, string Id)
264         {
265             var query2 = from q2 in dt.AsEnumerable()
266                          where q2.Field<string>("ParentID") == Id
267                          select q2;
268             if (query2 != null)
269             {
270                 foreach (var item in query2)
271                 {
272                     list.Add(item.Field<string>("TID").Trim());
273                     AddListString(ref list, dt, item.Field<string>("TID").Trim());
274                 }
275             }
276         }
277 
278 
279 
280 
281         #endregion
282 
283 
284 
285     }
286 
287 #endregion
View Code
原文地址:https://www.cnblogs.com/linsu/p/4939550.html