How do you implement a web crawler in Java?
A traditional crawler starts from the URLs of one or a few seed pages and collects the URLs found on those pages. As it crawls, it keeps extracting new URLs from the current page and adding them to a queue, until some stop condition of the system is met. For vertical search, a focused crawler, i.e. one that only fetches pages related to a specific topic, is the better fit.
Below is the core code of a simple crawler implemented in Java:
public void crawl() throws Throwable {
    while (continueCrawling()) {
        CrawlerUrl url = getNextUrl(); // get the next URL to crawl from the queue
        if (url != null) {
            printCrawlInfo();
            String content = getContent(url); // fetch the text content of the URL
            // A focused crawler only fetches pages relevant to its topic;
            // here a simple regular-expression match is used.
            if (isContentRelevant(content, this.regexpSearchPattern)) {
                saveContent(url, content); // save the page locally
                // Extract the links contained in the page and add them to the crawl queue.
                Collection urlStrings = extractUrls(content, url);
                addUrlsToUrlQueue(url, urlStrings);
            } else {
                System.out.println(url + " is not relevant, ignoring ...");
            }
            // Delay between requests to avoid being blocked by the target site.
            Thread.sleep(this.delayBetweenUrls);
        }
    }
    closeOutputStream();
}
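The loop above also calls a few helpers that the snippet does not show (continueCrawling(), saveContent(), addUrlsToUrlQueue()). As a rough idea of the queueing step only, here is a minimal sketch of what addUrlsToUrlQueue() might look like, assuming CrawlerUrl carries a crawl depth and the crawler keeps a visitedUrlMap; both are assumptions, not taken from the original code:

// Sketch only: child links sit one level deeper than the page they came from.
// Assumes a Map visitedUrlMap maintained elsewhere in the crawler class.
private void addUrlsToUrlQueue(CrawlerUrl url, Collection urlStrings) {
    int depth = url.getDepth() + 1;
    for (Object o : urlStrings) {
        String urlString = (String) o;
        if (!visitedUrlMap.containsKey(urlString)) {
            urlQueue.add(new CrawlerUrl(urlString, depth));
        }
    }
}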
private CrawlerUrl getNextUrl() throws Throwable {
    CrawlerUrl nextUrl = null;
    while ((nextUrl == null) && (!urlQueue.isEmpty())) {
        CrawlerUrl crawlerUrl = this.urlQueue.remove();
        // doWeHavePermissionToVisit: whether we are allowed to visit this URL. A polite
        // crawler follows the rules the site publishes in its "robots.txt".
        // isUrlAlreadyVisited: whether the URL has already been visited. Large search
        // engines often use a BloomFilter for de-duplication; a HashMap is used here
        // for simplicity.
        // isDepthAcceptable: whether the configured depth limit has been reached. Crawlers
        // usually traverse breadth-first; some sites build crawler traps (automatically
        // generated invalid links that send the crawler into an endless loop), and a
        // depth limit helps avoid them.
        if (doWeHavePermissionToVisit(crawlerUrl)
                && (!isUrlAlreadyVisited(crawlerUrl))
                && isDepthAcceptable(crawlerUrl)) {
            nextUrl = crawlerUrl;
            // System.out.println("Next url to be visited is " + nextUrl);
        }
    }
    return nextUrl;
}
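The comments above mention that large search engines often replace the HashMap with a BloomFilter when tracking visited URLs. For reference only, here is a sketch of Bloom-filter de-duplication using Guava; it is not part of the original crawler and requires Guava on the classpath:

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import java.nio.charset.StandardCharsets;

// Sketch only: Bloom-filter based de-duplication instead of the HashMap used above.
// A Bloom filter never misses a URL it has seen, but may report a small fraction of
// unseen URLs as "already visited" (the false-positive rate below).
public class VisitedUrlFilter {
    private final BloomFilter<String> seen =
            BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8),
                    1_000_000,   // expected number of URLs
                    0.001);      // acceptable false-positive rate

    public boolean isProbablyVisited(String url) {
        return seen.mightContain(url);
    }

    public void markVisited(String url) {
        seen.put(url);
    }
}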
private String getContent(CrawlerUrl url) throws Throwable {
    // The HttpClient 4.1 API is called differently from earlier versions.
    HttpClient client = new DefaultHttpClient();
    HttpGet httpGet = new HttpGet(url.getUrlString());
    StringBuffer strBuf = new StringBuffer();
    HttpResponse response = client.execute(httpGet);
    if (HttpStatus.SC_OK == response.getStatusLine().getStatusCode()) {
        HttpEntity entity = response.getEntity();
        if (entity != null) {
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(entity.getContent(), "UTF-8"));
            String line = null;
            if (entity.getContentLength() > 0) {
                strBuf = new StringBuffer((int) entity.getContentLength());
                while ((line = reader.readLine()) != null) {
                    strBuf.append(line);
                }
            }
        }
        if (entity != null) {
            entity.consumeContent();
        }
    }
    // Mark the URL as visited.
    markUrlAsVisited(url);
    return strBuf.toString();
}
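getContent() above is written against HttpClient 4.1, whose DefaultHttpClient has since been deprecated. If you are on Java 11 or later, the JDK's built-in java.net.http client can do the same fetch without any third-party dependency; a minimal sketch, not from the original code:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

// Sketch only: fetch a page body with the JDK 11+ HTTP client.
private String getContentWithJdkClient(String urlString) throws Exception {
    HttpClient client = HttpClient.newHttpClient();
    HttpRequest request = HttpRequest.newBuilder(URI.create(urlString))
            .GET()
            .build();
    HttpResponse<String> response =
            client.send(request, HttpResponse.BodyHandlers.ofString());
    // Only keep the body on HTTP 200, mirroring the HttpStatus.SC_OK check above.
    return response.statusCode() == 200 ? response.body() : "";
}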
public static boolean isContentRelevant(String content,
        Pattern regexpPattern) {
    boolean retValue = false;
    if (content != null) {
        // Does the content match the regular expression?
        Matcher m = regexpPattern.matcher(content.toLowerCase());
        retValue = m.find();
    }
    return retValue;
}
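For example, the regexpSearchPattern that crawl() passes in can simply be the topic keyword compiled into a Pattern. The keyword "java" below matches the regexp used in main() further down; the HTML string is only an illustration:

// Usage sketch: compile the topic keyword once and reuse it for every page.
Pattern regexpSearchPattern = Pattern.compile("java");
boolean relevant = isContentRelevant("<html>Java web crawlers</html>", regexpSearchPattern);
// relevant is true: the page text, lower-cased by isContentRelevant(), contains "java".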
public List extractUrls(String text, CrawlerUrl crawlerUrl) {
    Map urlMap = new HashMap();
    extractHttpUrls(urlMap, text);
    extractRelativeUrls(urlMap, text, crawlerUrl);
    return new ArrayList(urlMap.keySet());
}

private void extractHttpUrls(Map urlMap, String text) {
    Matcher m = httpRegexp.matcher(text);
    while (m.find()) {
        String url = m.group();
        String[] terms = url.split("a href=\"");
        for (String term : terms) {
            // System.out.println("Term = " + term);
            if (term.startsWith("http")) {
                int index = term.indexOf("\"");
                if (index > 0) {
                    term = term.substring(0, index);
                }
                urlMap.put(term, term);
                System.out.println("Hyperlink: " + term);
            }
        }
    }
}
private void extractRelativeUrls(Map urlMap, String text,
        CrawlerUrl crawlerUrl) {
    Matcher m = relativeRegexp.matcher(text);
    URL textURL = crawlerUrl.getURL();
    String host = textURL.getHost();
    while (m.find()) {
        String url = m.group();
        String[] terms = url.split("a href=\"");
        for (String term : terms) {
            if (term.startsWith("/")) {
                int index = term.indexOf("\"");
                if (index > 0) {
                    term = term.substring(0, index);
                }
                String s = "http://" + host + term;
                urlMap.put(s, s);
                System.out.println("Relative url: " + s);
            }
        }
    }
}
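Both extraction methods reference httpRegexp and relativeRegexp, which the snippet never defines. Definitions along the following lines would be consistent with the split on a href=" used above; treat them as an assumption rather than the original author's exact patterns:

// Assumed patterns, consistent with how the matches are post-processed above.
private static final Pattern httpRegexp =
        Pattern.compile("<a href=\"http(.)*\">", Pattern.CASE_INSENSITIVE);
private static final Pattern relativeRegexp =
        Pattern.compile("<a href=\"(.)*\">", Pattern.CASE_INSENSITIVE);

Regex-based link extraction like this is fragile; for anything beyond a toy crawler, an HTML parser such as jsoup is the safer choice.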
public static void main(String[] args) {
    try {
        String url = "..."; // seed URL to start crawling from
        Queue urlQueue = new LinkedList();
        String regexp = "java";
        urlQueue.add(new CrawlerUrl(url, 0));
        NaiveCrawler crawler = new NaiveCrawler(urlQueue, 100, 5, 1000L,
                regexp);
        // boolean allowCrawl = crawler.areWeAllowedToVisit(url);
        // System.out.println("Allowed to crawl: " + url + " " +
        // allowCrawl);
        crawler.crawl();
    } catch (Throwable t) {
        System.out.println(t.toString());
        t.printStackTrace();
    }
}
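Finally, the CrawlerUrl class used throughout is not shown either. A minimal version, matched to how it is used above (a (url, depth) constructor plus getUrlString(), getDepth() and getURL()), could look like the sketch below; it is an assumption, not the original class:

import java.net.MalformedURLException;
import java.net.URL;

// Minimal sketch of the CrawlerUrl holder used above (not the original class).
public class CrawlerUrl {
    private final String urlString;
    private final int depth;
    private URL url; // parsed lazily on first use

    public CrawlerUrl(String urlString, int depth) {
        this.urlString = urlString;
        this.depth = depth;
    }

    public String getUrlString() {
        return urlString;
    }

    public int getDepth() {
        return depth;
    }

    public URL getURL() {
        if (url == null) {
            try {
                url = new URL(urlString);
            } catch (MalformedURLException e) {
                throw new IllegalArgumentException("Bad URL: " + urlString, e);
            }
        }
        return url;
    }

    @Override
    public String toString() {
        return urlString + " [depth=" + depth + "]";
    }
}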