Java應用開源框架實現簡易web搜索引擎

更新時間：2017年11月28日 10:48:34 作者：lannooooooooooo

本篇文章主要介紹了Java應用開源框架實現簡易web搜索引擎，小編覺得挺不錯的，現在分享給大家，也給大家做個參考。一起跟隨小編過來看看吧。

引言

應用 Java 的開源庫，編寫一個搜索引擎，這個引擎能爬取一個網站的內容，並根據網頁內容進行深度爬取，獲取所有相關的網頁地址和內容；用戶可以通過關鍵詞搜索所有相關的網址。

具體功能

(1) 用戶可以指定爬取一個url對應的網頁的內容。
(2) 對網頁內容進行解析，並獲取其中所有的url鏈接地址。
(3) 用戶可以設定爬取深度，代表著從初始url對應的頁面開始，可以爬取其中所有的url對應的網頁內的url，以此類推。深度越大，能爬取到的網站越多。
(4) 對爬取到的url內容進行保存、建立索引。建立索引的內容是url地址本身，和url對應的網頁標題。
(5) 用戶可以通過關鍵詞對網址進行搜索，找出有該關鍵詞的url地址。
(6) 建立索引和搜索索引的過程能智能識別中文關鍵詞，能對關鍵詞進行分詞操作。
(7) 用戶可以指定保存索引的地址、初始url、爬取深度、進行搜索的關鍵詞和最大匹配項。

開源框架

Lucene
Jsoup

源碼

爬蟲部分：Spider.java

package webCrawler.Spider;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Scanner;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import webCrawler.Index.BuildIndex;

/**
 * Level-by-level web crawler: starting from one URL it follows links for a fixed
 * number of levels, collects every URL it finds, and stores each URL together
 * with its page title in an index via {@link BuildIndex}.
 *
 * <p>Only links whose absolute URL contains {@code zju.edu.cn} are collected.
 *
 * @author lannooo
 */

public class Spider {
  /** Substring a link must contain to be collected by {@link #getBareLinks(String)}. */
  private static final String DOMAIN_FILTER = "zju.edu.cn";

  /** All URLs collected so far; filled by {@link #getAll()}. */
  ArrayList<String> URLs;
  private String startURL;
  private int digLevel;

  /**
   * @param startURL URL the crawl starts from
   * @param digLevel number of link levels to follow from the start URL
   */
  public Spider(String startURL, int digLevel){
    this.startURL = startURL;
    this.digLevel = digLevel;
    this.URLs = new ArrayList<>();
  }

  /**
   * Fetches every URL in {@code arrayList} and returns the distinct links found on
   * those pages.
   *
   * @param level remaining crawl depth; nothing is fetched when it is not positive
   * @param arrayList URLs whose pages should be scanned for links
   * @return the distinct URLs found; an empty list (never {@code null}) when
   *     {@code level <= 0} or {@code arrayList} is {@code null}
   * @throws IOException declared for callers; page fetch failures are handled
   *     inside {@link #getBareLinks(String)}
   */
  public ArrayList<String> getLevelURLs(int level, ArrayList<String> arrayList)
      throws IOException{
    if(level<=0 || arrayList==null){
      /* Return an empty list rather than null so callers can iterate safely. */
      return new ArrayList<>();
    }
    /* A set removes links that appear on more than one page. */
    HashSet<String> found = new HashSet<>();
    for(String url: arrayList){
      found.addAll(getBareLinks(url));
    }
    return new ArrayList<>(found);
  }

  /**
   * Crawls from the start URL for {@code digLevel} levels and adds every URL
   * reached (including the start URL) to {@link #URLs}, without duplicates.
   *
   * <p>A URL is only scanned for links the first time it is discovered, so pages
   * that link back to earlier levels are not fetched again.
   *
   * @throws IOException declared for callers; see {@link #getLevelURLs(int, ArrayList)}
   */
  public void getAll() throws IOException{
    HashSet<String> visited = new HashSet<>();
    ArrayList<String> currentURLs = new ArrayList<>();
    /* The crawl frontier starts with just the start URL. */
    currentURLs.add(startURL);
    visited.add(startURL);
    for(int i=digLevel; i>0; i--){
      System.out.println("Dig into level: " + (digLevel-i+1));
      ArrayList<String> newURLs = new ArrayList<>();
      for(String each: getLevelURLs(i, currentURLs)){
        /* add() is false for URLs seen before; those are not crawled a second time. */
        if(visited.add(each)){
          newURLs.add(each);
        }
      }
      /* Only newly discovered URLs form the next level's frontier. */
      currentURLs = newURLs;
    }
    /* Keep anything collected by an earlier call, still without duplicates. */
    visited.addAll(URLs);
    URLs = new ArrayList<>(visited);
  }

  /**
   * Fetches the title of every collected URL and writes one index entry
   * (fields {@code url} and {@code text}) per URL whose page could be fetched.
   *
   * @param path directory in which the index is stored
   * @throws IOException declared for callers; fetch failures only skip that URL
   */
  public void storeURLsAndInfo(String path) throws IOException{
    BuildIndex build = new BuildIndex(path);
    try {
      for(String each:URLs){
        String text = getLinkText(each);
        if(text!=null){
          build.addField("url", each);
          build.addField("text", text);
          /* Commit the two fields above as one index entry. */
          build.pushIndex();
        }
      }
    } finally {
      /* Always release the index writer, even if the loop fails unexpectedly. */
      build.close();
    }
  }

  /**
   * Fetches a page and returns its title.
   *
   * @param url URL of the page
   * @return the page title, or {@code null} when the page could not be fetched
   *     (any failure, not only a timeout, is reported as {@code [TIMEOUT]})
   * @throws IOException declared for callers; fetch failures are caught here
   */
  public String getLinkText(String url) throws IOException{
    Document document;
    try {
      /* Connect with jsoup; the timeout is 3000 ms. */
      document = Jsoup.connect(url).timeout(3000).get();
    } catch (Exception e) {
      System.out.println("[TIMEOUT]Get title of url:"+url);
      return null;
    }
    return document.title();
  }


  /**
   * Fetches a page and returns the distinct links inside its {@code <body>} that
   * contain {@code zju.edu.cn}. Each link is made absolute, stripped of its
   * fragment ({@code #...}) and of one trailing {@code /}.
   *
   * @param url URL of the page to scan
   * @return the distinct matching links; empty when the page could not be fetched
   * @throws IOException declared for callers; fetch failures are caught here
   */
  public ArrayList<String> getBareLinks(String url) throws IOException{
    Document document;
    try {
      document = Jsoup.connect(url).timeout(2000).get();
    } catch (Exception e) {
      /* Best effort: an unreachable page contributes no links, but say so. */
      System.out.println("[SKIP]Get links of url:"+url);
      return new ArrayList<>();
    }
    /* All <a> elements with an href attribute inside <body>. */
    Elements links = document.select("body").select("a[href]");

    HashSet<String> unique = new HashSet<>();
    for(Element link: links){
      String href = link.attr("abs:href");
      /*
       * Drop the fragment entirely. Removing only the '#' character would turn
       * "page#section" into the different URL "pagesection".
       */
      int fragmentStart = href.indexOf('#');
      if(fragmentStart>=0){
        href = href.substring(0, fragmentStart);
      }
      if(href.contains(DOMAIN_FILTER)){
        if (href.endsWith("/")){
          href = href.substring(0, href.length()-1);
        }
        unique.add(href);
      }
    }
    return new ArrayList<>(unique);
  }

  /**
   * Console entry point: asks for the start URL, the crawl depth and the index
   * directory, then crawls and stores the index.
   */
  public static void main(String[] args) {
    Scanner in = new Scanner(System.in);
    System.out.println("Enter url:");
    String url = in.nextLine().trim();
    while(!(url.startsWith("http://") || url.startsWith("https://"))){
      System.out.println("http:// or https:// is needed!");
      System.out.println("Enter url:");
      url = in.nextLine().trim();
    }
    System.out.println("Enter depth to dig more urls[<=3 recommended]：");
    int depth = in.nextInt();
    /*
     * nextInt() leaves the rest of the line (the newline) unread; consume it so
     * the next nextLine() really waits for the path instead of returning "".
     */
    in.nextLine();
    Spider spider = new Spider(url, depth);
    System.out.println("Enter path you want to save[default=d:/index-spider]:");
    String path = in.nextLine().trim();
    if(path.length()==0){
      path = "d:/index-spider";
    }
    try {
      System.out.println("Start fetching...");
      spider.getAll();
      System.out.println("Urls got success!");
      spider.storeURLsAndInfo(path);
      System.out.println("Stored success!");
    } catch (IOException e) {
      e.printStackTrace();
    }
  }


}

建立索引：BuildIndex.java

package webCrawler.Index;

import java.io.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Thin wrapper around a Lucene {@link IndexWriter} that builds one document at a
 * time from named text fields and writes it to an on-disk index analysed with
 * {@link IKAnalyzer}.
 *
 * <p>Failures are reported on standard error and do not propagate to the caller.
 *
 * @author lannooo
 */
public class BuildIndex {
  private File file;
  private Directory directory;
  private IndexWriter indexWriter;
  private IndexWriterConfig config;
  private Analyzer analyzer;
  private Document document;

  /**
   * Opens (or creates) the index stored in {@code path}.
   *
   * @param path directory in which the index is stored
   */
  public BuildIndex(String path) {
    /* Created first so addField never sees a null document, even if opening fails. */
    document = new Document();
    try {
      file = new File(path);
      directory = FSDirectory.open(file);
      analyzer = new IKAnalyzer();    /* Chinese word-segmentation analyzer */
      config = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer);
      indexWriter = new IndexWriter(directory, config);
    } catch (Exception e) {
      e.printStackTrace();
      /* Do not leave a half-opened directory or analyzer behind. */
      release();
    }
  }

  /**
   * Adds a stored text field to the document currently being built.
   *
   * @param fieldName name of the new field
   * @param fieldText content of the new field
   */
  public void addField(String fieldName, String fieldText){
    try{
      Field field = new TextField(fieldName, fieldText, Field.Store.YES);
      document.add(field);
    }catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Writes the current document to the index and starts a new, empty document.
   * The current document is discarded even when writing fails, so its fields
   * cannot leak into the next entry.
   */
  public void pushIndex(){
    try {
      if (indexWriter == null) {
        System.err.println("Index writer is not open; document was not stored.");
        return;
      }
      indexWriter.addDocument(document);
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      document = new Document();
    }
  }

  /**
   * Adds one complete document (fields {@code url} and {@code text}) to the index.
   *
   * @param url the URL to store
   * @param text the text that belongs to the URL
   */
  public void addOneIndex(String url, String text){
    this.addField("url", url);
    this.addField("text", text);
    this.pushIndex();
  }

  /**
   * Closes the index writer and releases the directory and analyzer.
   * Calling it more than once is harmless.
   */
  public void close(){
    release();
  }

  /** Closes writer, directory and analyzer in that order and forgets them. */
  private void release(){
    /* The writer must be closed before the directory it writes to. */
    closeQuietly(indexWriter);
    indexWriter = null;
    closeQuietly(directory);
    directory = null;
    closeQuietly(analyzer);
    analyzer = null;
  }

  /** Closes {@code resource} if it is non-null, reporting (not throwing) any failure. */
  private static void closeQuietly(Closeable resource){
    if (resource == null) {
      return;
    }
    try {
      resource.close();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

}

搜索索引：SearchIndex.java

package webCrawler.Index;

import java.io.File;
import java.util.Scanner;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Searches an on-disk Lucene index with queries analysed by {@link IKAnalyzer}
 * and prints the {@code url} and {@code text} fields of the matching documents.
 *
 * <p>Failures are reported on standard error and do not propagate to the caller.
 *
 * @author lannooo
 */
public class SearchIndex {
  private IndexSearcher indexSearcher;
  private Analyzer analyzer;
  private QueryParser parser;
  private Query query;
  private TopDocs hits;
  private DirectoryReader reader;
  private FSDirectory directory;

  /**
   * Opens the index stored in {@code path} for searching.
   *
   * @param path directory that contains the index
   */
  public SearchIndex(String path){
    try {
      directory = FSDirectory.open(new File(path));
      reader = DirectoryReader.open(directory);
      indexSearcher = new IndexSearcher(reader);
      analyzer = new IKAnalyzer();
    } catch (Exception e) {
      e.printStackTrace();
      /* Do not leave a half-opened directory or reader behind. */
      release();
    }
  }

  /**
   * Parses {@code text} as a query against {@code fieldName} and runs it.
   *
   * @param fieldName name of the field to search
   * @param text query text
   * @param matchNumber maximum number of matches to keep
   * @return total number of matching documents, or {@code -1} when the search
   *     failed or the index is not open
   */
  public int search(String fieldName, String text, int matchNumber){
    /* Forget earlier results so a failed search can never print stale hits. */
    hits = null;
    if (indexSearcher == null) {
      System.err.println("Index is not open; cannot search.");
      return -1;
    }
    try {
      parser = new QueryParser(fieldName, analyzer);
      query = parser.parse(text);
      hits = indexSearcher.search(query, matchNumber);

      return hits.totalHits;
    } catch (Exception e) {
      e.printStackTrace();
    }
    return -1;
  }

  /**
   * Prints all matches of the last search and then closes this searcher, so the
   * instance cannot be used for further searches afterwards. The searcher is
   * closed even when printing fails.
   */
  public void printHits(){
    try{
      printCurrentHits();
    }catch (Exception e) {
      e.printStackTrace();
    } finally {
      close();
    }
  }

  /**
   * Releases the index reader and its directory. Calling it more than once is
   * harmless.
   */
  public void close(){
    release();
  }

  /** Prints the matches of the last search without closing anything. */
  private void printCurrentHits() throws Exception{
    if (hits == null) {
      System.err.println("No search result to print.");
      return;
    }
    System.out.println("Total hits number:"+hits.totalHits);
    for(ScoreDoc doc: hits.scoreDocs){
      Document document = indexSearcher.doc(doc.doc);
      System.out.println(document.get("url"));
      System.out.println(document.get("text"));
    }
  }

  /** Closes reader and directory, reporting (not throwing) failures, and forgets them. */
  private void release(){
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception e) {
        e.printStackTrace();
      }
      reader = null;
    }
    if (directory != null) {
      try {
        directory.close();
      } catch (Exception e) {
        e.printStackTrace();
      }
      directory = null;
    }
    indexSearcher = null;
  }

  /**
   * Console entry point: asks for the index directory and the maximum number of
   * hits, then answers queries until the user enters {@code q}.
   */
  public static void main(String[] args) {
    Scanner in = new Scanner(System.in);
    System.out.println("Enter path of the index:");
    String path = in.nextLine().trim();
    while(path.length()==0){
      System.out.println("Enter path of the index:");
      path = in.nextLine().trim();
    }

    System.out.println("Enter max hit number:");
    int max = in.nextInt();
    /* Zero is rejected as well: Lucene requires a positive number of hits. */
    while(max<=0){
      System.out.println("Enter max hit number:");
      max = in.nextInt();
    }
    /* nextInt() leaves the newline unread; consume it before reading queries. */
    in.nextLine();

    /* Open the index once and reuse it for every query. */
    SearchIndex search = new SearchIndex(path);
    try {
      System.out.print("Search>>> ");
      String text = in.nextLine().trim();
      /* Read queries in a loop: "q" quits; an empty line just shows the prompt again. */
      while(!text.equals("q")){
        if(text.length()>0){
          int hits = search.search("text", text, max);
          if(hits!=-1){
            try {
              search.printCurrentHits();
            } catch (Exception e) {
              e.printStackTrace();
            }
          }
        }
        System.out.print("Search>>> ");
        text = in.nextLine().trim();
      }
    } finally {
      search.close();
    }
  }
}

UI界面：UI.java（這裡為了方便只是命令行的形式，可以根據需求寫一個GUI界面）

package webCrawler.UI;

import java.util.Scanner;

import webCrawler.Index.SearchIndex;

/**
 * Minimal command-line front end for {@link SearchIndex}: reads keywords from
 * standard input and prints the matching index entries.
 *
 * @author lannooo
 */
public class UI {
  /** Index directory used when none is given on the command line. */
  private static final String DEFAULT_INDEX_PATH = "d:/index-spider2";
  /** Maximum number of hits used when none is given on the command line. */
  private static final int DEFAULT_MAX_HITS = 20;

  /**
   * Runs the search prompt until the user enters {@code q}, an empty line, or
   * the input ends.
   *
   * @param args optional: {@code args[0]} is the index directory and
   *     {@code args[1]} the maximum number of hits (a positive integer);
   *     missing or invalid values fall back to the defaults
   */
  public static void main(String[] args) {
    String indexPath = DEFAULT_INDEX_PATH;
    int maxHits = DEFAULT_MAX_HITS;
    if (args != null && args.length > 0 && args[0].trim().length() > 0) {
      indexPath = args[0].trim();
    }
    if (args != null && args.length > 1) {
      try {
        int requested = Integer.parseInt(args[1].trim());
        if (requested > 0) {
          maxHits = requested;
        }
      } catch (NumberFormatException e) {
        System.out.println("Invalid max hit number, using " + DEFAULT_MAX_HITS);
      }
    }

    Scanner in = new Scanner(System.in);
    System.out.print("Search>>> ");
    /* hasNextLine() ends the loop cleanly when the input is exhausted. */
    while (in.hasNextLine()) {
      String text = in.nextLine().trim();
      /* "q" or an empty line ends the session. */
      if (text.equals("q") || text.length() == 0) {
        break;
      }
      SearchIndex search = new SearchIndex(indexPath);
      int hits = search.search("text", text, maxHits);
      if (hits != -1) {
        search.printHits();
      }
      System.out.print("Search>>> ");
    }
  }
}

以上就是本文的全部內容，希望對大家的學習有所幫助，也希望大家多多支持腳本之家。

您可能感興趣的文章: