初探站内搜索(中)

接上篇

首先我们配置 Global.asax,在 Application_Start 中添加如下代码:初始化日志组件,并创建定时建立索引的任务。

protected void Application_Start(object sender, EventArgs e)
{
    // Initialize log4net from the settings declared in web.config.
    log4net.Config.XmlConfigurator.Configure();
    // Initialize the PanGu word segmenter from its XML configuration file.
    Segment.Init(HttpContext.Current.Server.MapPath("~/PanGu.xml"));

    // Set up the Quartz scheduler that periodically rebuilds the search index.
    ISchedulerFactory schedulerFactory = new StdSchedulerFactory();
    IScheduler scheduler = schedulerFactory.GetScheduler();

    // Index_Job implements IJob; its Execute method performs the crawl/index run.
    JobDetail indexJob = new JobDetail("job1", "group1", typeof(Index_Job));

    // Start one second from now and repeat indefinitely, once per hour.
    DateTime startTime = TriggerUtils.GetNextGivenSecondDate(null, 1);
    TimeSpan repeatInterval = TimeSpan.FromHours(1);
    Trigger indexTrigger = new SimpleTrigger(
        "trigger1", "group1", "job1", "group1",
        startTime, null, SimpleTrigger.RepeatIndefinitely, repeatInterval);

    scheduler.AddJob(indexJob, true);
    scheduler.ScheduleJob(indexTrigger);
    scheduler.Start();
}

然后我们进入 Index_Job 类中编写它的定时任务代码:注意实现 IJob 接口,并把计划任务的执行代码写到 Execute 方法里面。

索引完成后,索引库所需的文件都会被 Lucene.Net 生成到 Index 文件夹下。

/// <summary>
/// Quartz job that crawls forum posts over HTTP and (re)builds the
/// Lucene.Net full-text index stored under ~/Index.
/// </summary>
public class Index_Job : IJob
{
    private static ILog log = LogManager.GetLogger(typeof(Index_Job));

    // Cached regex extracting the numeric post id from a "showtopic-<id>" link.
    private static readonly Regex TopicIdRegex = new Regex(@"showtopic-(\d+)");

    #region IJob members

    /// <summary>
    /// Executes one scheduled indexing run: downloads each post page,
    /// strips it to plain text and writes it into the Lucene index.
    /// </summary>
    public void Execute(JobExecutionContext context)
    {
        // HostingEnvironment.MapPath works outside a request context,
        // unlike HttpContext.Current.Server.MapPath (hence the switch).
        string indexPath = HostingEnvironment.MapPath("~/Index");
        log.Debug("开始创建索引,索引目录:" + indexPath);

        FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
        IndexWriter writer = null;
        try
        {
            bool isUpdate = IndexReader.IndexExists(directory);
            log.Debug("索引目录存在状态:" + isUpdate);
            if (isUpdate && IndexWriter.IsLocked(directory))
            {
                // A previous run may have crashed while holding the write lock.
                log.Debug("解锁索引库");
                IndexWriter.Unlock(directory);
            }

            log.Debug("开始爬文章");
            // Third argument: true creates a fresh index, false appends to it.
            writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
            string siteURL = ConfigurationManager.AppSettings["SiteURL"];

            // NOTE(review): the range below is hard-coded for testing; use
            // "for (int i = 1; i <= GetMaxId(siteURL); i++)" to crawl every post.
            for (int i = 900; i <= 1000; i++)
            {
                log.Debug("开始爬编号为" + i.ToString() + "的帖子");
                try
                {
                    IndexPost(writer, siteURL, i);
                    log.Debug("爬编号为" + i.ToString() + "的帖子结束");
                }
                catch (Exception ex)
                {
                    // One broken/missing post must not abort the whole run.
                    log.Error("爬编号为" + i.ToString() + "的帖子发生异常", ex);
                }
            }
        }
        finally
        {
            // Always release the writer and directory, even when indexing
            // fails, so the index write lock is not left behind.
            log.Debug("结束索引,开始关闭Writer和Directory");
            if (writer != null)
            {
                writer.Close();
            }
            directory.Close();
            log.Debug("关闭Writer和Directiory完成");
        }
    }

    /// <summary>
    /// Downloads post number <paramref name="id"/> and replaces its document
    /// in the index (the "url" field acts as the primary key, so re-runs do
    /// not create duplicates).
    /// </summary>
    private static void IndexPost(IndexWriter writer, string siteURL, int id)
    {
        string url = siteURL + "showtopic-" + id + ".aspx";
        string txt;
        using (WebClient wc = new WebClient()) // WebClient is IDisposable
        {
            wc.Encoding = Encoding.UTF8;
            txt = wc.DownloadString(url);
        }

        HTMLDocumentClass htmldoc = new HTMLDocumentClass();
        htmldoc.designMode = "on"; // design mode: scripts in the page are not executed
        htmldoc.IHTMLDocument2_write(txt);
        string title = htmldoc.title;
        string bodyText = htmldoc.body.innerText;

        // Delete any previous version of this document before re-adding it.
        writer.DeleteDocuments(new Term("url", url));

        Document document = new Document();
        document.Add(new Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.Add(new Field("title", title, Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.Add(new Field("body", bodyText, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
        writer.AddDocument(document);
    }

    #endregion

    /// <summary>
    /// Reads the site's RSS feed and returns the id of the newest post,
    /// i.e. the intended upper bound for the crawl loop.
    /// </summary>
    private static int GetMaxId(string siteURL)
    {
        string html;
        using (WebClient wc = new WebClient())
        {
            wc.Encoding = Encoding.UTF8;
            html = wc.DownloadString(siteURL + "tools/rss.aspx");
        }
        XDocument doc = XDocument.Parse(html);
        string link = doc.Descendants("item").First().Element("link").Value;
        Match match = TopicIdRegex.Match(link);
        return Convert.ToInt32(match.Groups[1].Value);
    }
}

Default.aspx页面前台html代码(表单form中):

1 <form id="form1" action="Default.aspx" method ="get">
2 <div>
3
4 <input type="text" id="txtKw" value="<%=Request["kw"] %>" name ="kw" />
5 <script type="text/javascript" >
6 $("#txtKw").autocomplete({
7 source:"SearchSuggestion.ashx" , select:function (e, ui) {
8 $("#txtKw").val(ui.item.value);
9 $("#sb").click();
10 } // the autocomplete widget issues the AJAX request automatically
11 });
12 </script>
13
14 <input type="submit" id="sb" value="搜索" />
15
16 </div>
17 <asp:Repeater ID="RepeaterResult" runat="server">
18 <ItemTemplate>
19 <a href='<%#Eval("URL")%>' > <%#Eval("TITLE")%> </a>
20 <br />
21 <p>
22 <%#Eval("BODY") %>
23 </p>
24 </ItemTemplate>
25 </asp:Repeater>
26 </form>

注意把 viewstate 禁用了,防止最后生成的客户端 html 代码中有一大堆 viewstate 的东西,显得我们不专业 呵呵~(EnableViewState="false")

我们知道一旦禁用 viewstate,所有跑在服务端的基本控件都不能用了(数据绑定控件和链接控件除外),所以我们回归原始的 html,用 get 方法提交表单。

因此后台代码我们就在 Page_Load 方法中实现:表单提交后,通过判断用户输入的关键字 Request["kw"] 是否为空来决定是否执行搜索。后台处理代码如下(其中有注释,我就不详细解释了):

/// <summary>
/// Search results page: reads the "kw" query-string parameter, searches the
/// Lucene index built by Index_Job and binds highlighted hits to RepeaterResult.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    protected void Page_Load(object sender, EventArgs e)
    {
        string kw = Request["kw"];
        if (string.IsNullOrEmpty(kw))
        {
            // No search term submitted yet - just render the empty form.
            return;
        }

        // Record the search term for the suggestion/statistics features.
        new SearchLogTableAdapter().Insert(Guid.NewGuid(), DateTime.Now, Request.UserHostAddress, kw);

        string indexPath = Server.MapPath("~/Index"); // index folder produced by Index_Job
        FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
        IndexReader indexReader = IndexReader.Open(directory, true); // read-only reader
        IndexSearcher searcher = new IndexSearcher(indexReader);
        try
        {
            // Build a phrase query from the segmented keywords; SetSlop allows
            // the words to be up to 1000 positions apart and still match.
            PhraseQuery query = new PhraseQuery();
            foreach (string word in segString(kw))
            {
                query.Add(new Term("body", word));
            }
            query.SetSlop(1000);

            // Collect at most the 1000 best-scoring documents.
            TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
            searcher.Search(query, null, collector);
            TopDocs topDocs = collector.TopDocs();

            List<SearchResult> list = new List<SearchResult>();
            foreach (ScoreDoc scoreDoc in topDocs.scoreDocs)
            {
                Document document = searcher.Doc(scoreDoc.doc);
                string url = document.Get("url");
                string title = document.Get("title");
                string body = document.Get("body");

                // Highlight the keyword in the body snippet in red.
                SimpleHTMLFormatter bodyFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
                Highlighter bodyHighlighter = new Highlighter(bodyFormatter, new Segment());
                bodyHighlighter.FragmentSize = 200;
                string bodyFragment = bodyHighlighter.GetBestFragment(kw, body);
                // GetBestFragment yields null/empty when the keyword is not in
                // this field - fall back to a plain leading snippet.
                if (string.IsNullOrEmpty(bodyFragment))
                {
                    bodyFragment = body.Length > 200 ? body.Substring(0, 200) : body;
                }

                // Give the keyword in the title a red background.
                SimpleHTMLFormatter titleFormatter = new SimpleHTMLFormatter("<font style='background-color:red'>", "</font>");
                Highlighter titleHighlighter = new Highlighter(titleFormatter, new Segment());
                titleHighlighter.FragmentSize = 200;
                // BUG FIX: the original called GetBestFragment(title, title),
                // i.e. used the title itself as the search keyword; highlight
                // the user's search term instead.
                string titleFragment = titleHighlighter.GetBestFragment(kw, title);
                if (string.IsNullOrEmpty(titleFragment))
                {
                    titleFragment = title;
                }

                list.Add(new SearchResult
                {
                    URL = url,
                    TITLE = titleFragment,
                    BODY = bodyFragment
                });
            }

            RepeaterResult.DataSource = list;
            RepeaterResult.DataBind();
        }
        finally
        {
            // Release the index resources even when the search throws.
            searcher.Close();
            indexReader.Close();
            directory.Close();
        }
    }

    /// <summary>
    /// Splits the raw query into words using the PanGu segmenter.
    /// </summary>
    private static string[] segString(string s)
    {
        Segment segment = new Segment();
        return (from wordInfo in segment.DoSegment(s) select wordInfo.Word).ToArray();
    }
}
/// <summary>
/// One row of the search results that is data-bound to RepeaterResult.
/// </summary>
public class SearchResult
{
    /// <summary>Link to the original post.</summary>
    public string URL { get; set; }

    /// <summary>Post title, possibly containing highlight markup.</summary>
    public string TITLE { get; set; }

    /// <summary>Highlighted snippet of the post body.</summary>
    public string BODY { get; set; }
}

接下去我们实现输入关键词自动补全。用JQueryUI和AJAX请求后台关键词效果。

未完,待续。。。

原文地址:https://www.cnblogs.com/lys_013/p/1851963.html