From 51d384d54b4100c30435613e277c879e968ed171 Mon Sep 17 00:00:00 2001
From: Junjiu
Date: Sat, 7 Jan 2017 16:40:44 -0600
Subject: [PATCH 1/2] delete the readme

---
 .project  |  11 +++++
 README.md | 119 ------------------------------------------------------
 2 files changed, 11 insertions(+), 119 deletions(-)
 create mode 100644 .project

diff --git a/.project b/.project
new file mode 100644
index 00000000..db60faaa
--- /dev/null
+++ b/.project
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>WebCollector</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+	</buildSpec>
+	<natures>
+	</natures>
+</projectDescription>
diff --git a/README.md b/README.md
index c82d5fba..e69de29b 100644
--- a/README.md
+++ b/README.md
@@ -1,119 +0,0 @@
-#WebCollector
-WebCollector is an open source web crawler framework based on Java. It provides
-some simple interfaces for crawling the Web; you can set up a
-multi-threaded web crawler in less than 5 minutes.
-
-
-
-
-##HomePage
-[https://github.com/CrawlScript/WebCollector](https://github.com/CrawlScript/WebCollector)
-
-##Document
-[WebCollector-GitDoc](https://github.com/CrawlScript/WebCollector-GitDoc)
-
-
-
-##Installation
-
-### Without Maven
-WebCollector jars are available on the [HomePage](https://github.com/CrawlScript/WebCollector).
-
-+ __webcollector-version-bin.zip__ contains core jars.
-
-
-##Quickstart
-Let's crawl some news from hfut news. This demo prints the titles and contents extracted from hfut news pages.
-
-[NewsCrawler.java](https://github.com/CrawlScript/WebCollector/blob/master/NewsCrawler.java):
-
-```java
-import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
-import cn.edu.hfut.dmic.webcollector.model.Page;
-import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
-import org.jsoup.nodes.Document;
-
-/**
- * Crawling news from hfut news
- *
- * @author hu
- */
-public class NewsCrawler extends BreadthCrawler {
-
-    /**
-     * @param crawlPath crawlPath is the path of the directory which maintains
-     * information of this crawler
-     * @param autoParse if autoParse is true, BreadthCrawler will automatically
-     * extract links which match the regex rules from pages
-     */
-    public NewsCrawler(String crawlPath, boolean autoParse) {
-        super(crawlPath, autoParse);
-        /*start page*/
-        this.addSeed("http://news.hfut.edu.cn/list-1-1.html");
-
-        /*fetch urls like http://news.hfut.edu.cn/show-xxxx.html*/
-        this.addRegex("http://news.hfut.edu.cn/show-.*html");
-        /*do not fetch jpg|png|gif*/
-        this.addRegex("-.*\\.(jpg|png|gif).*");
-        /*do not fetch urls that contain #*/
-        this.addRegex("-.*#.*");
-    }
-
-    @Override
-    public void visit(Page page, CrawlDatums next) {
-        String url = page.getUrl();
-        /*if the page is a news page*/
-        if (page.matchUrl("http://news.hfut.edu.cn/show-.*html")) {
-            /*we use jsoup to parse the page*/
-            Document doc = page.getDoc();
-
-            /*extract the title and content of the news by css selector*/
-            String title = page.select("div[id=Article]>h2").first().text();
-            String content = page.select("div#artibody", 0).text();
-
-            System.out.println("URL:\n" + url);
-            System.out.println("title:\n" + title);
-            System.out.println("content:\n" + content);
-
-            /*If you want to add urls to crawl, add them to next*/
-            /*WebCollector automatically filters links that have been fetched before*/
-            /*If autoParse is true and a link added to next does not match the regex rules, that link will also be filtered.*/
-            //next.add("http://xxxxxx.com");
-        }
-    }
-
-    public static void main(String[] args) throws Exception {
-        NewsCrawler crawler = new NewsCrawler("crawl", true);
-        crawler.setThreads(50);
-        crawler.setTopN(100);
-        //crawler.setResumable(true);
-        /*start crawling with a depth of 4*/
-        crawler.start(4);
-    }
-
-}
-```
-
-
-
-##Content Extraction
-WebCollector can automatically extract content from news webpages:
-
-```java
-News news = ContentExtractor.getNewsByHtml(html, url);
-News news = ContentExtractor.getNewsByHtml(html);
-News news = ContentExtractor.getNewsByUrl(url);
-
-String content = ContentExtractor.getContentByHtml(html, url);
-String content = ContentExtractor.getContentByHtml(html);
-String content = ContentExtractor.getContentByUrl(url);
-
-Element contentElement = ContentExtractor.getContentElementByHtml(html, url);
-Element contentElement = ContentExtractor.getContentElementByHtml(html);
-Element contentElement = ContentExtractor.getContentElementByUrl(url);
-```
-
-
-##Other Documentation
-
-+ [中文文档 (Chinese Documentation)](https://github.com/CrawlScript/WebCollector/blob/master/README.zh-cn.md)

From 365d541997fd89c46cdd34cb312518c15065116a Mon Sep 17 00:00:00 2001
From: Junjiu
Date: Sat, 7 Jan 2017 16:52:40 -0600
Subject: [PATCH 2/2] Revert "delete the readme"

This reverts commit 51d384d54b4100c30435613e277c879e968ed171.

---
 .project  |  11 -----
 README.md | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+), 11 deletions(-)
 delete mode 100644 .project

diff --git a/.project b/.project
deleted file mode 100644
index db60faaa..00000000
--- a/.project
+++ /dev/null
@@ -1,11 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
-	<name>WebCollector</name>
-	<comment></comment>
-	<projects>
-	</projects>
-	<buildSpec>
-	</buildSpec>
-	<natures>
-	</natures>
-</projectDescription>
diff --git a/README.md b/README.md
index e69de29b..c82d5fba 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,119 @@
+#WebCollector
+WebCollector is an open source web crawler framework based on Java. It provides
+some simple interfaces for crawling the Web; you can set up a
+multi-threaded web crawler in less than 5 minutes.
+
+
+
+
+##HomePage
+[https://github.com/CrawlScript/WebCollector](https://github.com/CrawlScript/WebCollector)
+
+##Document
+[WebCollector-GitDoc](https://github.com/CrawlScript/WebCollector-GitDoc)
+
+
+
+##Installation
+
+### Without Maven
+WebCollector jars are available on the [HomePage](https://github.com/CrawlScript/WebCollector).
+
++ __webcollector-version-bin.zip__ contains core jars.
+
+
+##Quickstart
+Let's crawl some news from hfut news. This demo prints the titles and contents extracted from hfut news pages.
+
+[NewsCrawler.java](https://github.com/CrawlScript/WebCollector/blob/master/NewsCrawler.java):
+
+```java
+import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
+import cn.edu.hfut.dmic.webcollector.model.Page;
+import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
+import org.jsoup.nodes.Document;
+
+/**
+ * Crawling news from hfut news
+ *
+ * @author hu
+ */
+public class NewsCrawler extends BreadthCrawler {
+
+    /**
+     * @param crawlPath crawlPath is the path of the directory which maintains
+     * information of this crawler
+     * @param autoParse if autoParse is true, BreadthCrawler will automatically
+     * extract links which match the regex rules from pages
+     */
+    public NewsCrawler(String crawlPath, boolean autoParse) {
+        super(crawlPath, autoParse);
+        /*start page*/
+        this.addSeed("http://news.hfut.edu.cn/list-1-1.html");
+
+        /*fetch urls like http://news.hfut.edu.cn/show-xxxx.html*/
+        this.addRegex("http://news.hfut.edu.cn/show-.*html");
+        /*do not fetch jpg|png|gif*/
+        this.addRegex("-.*\\.(jpg|png|gif).*");
+        /*do not fetch urls that contain #*/
+        this.addRegex("-.*#.*");
+    }
+
+    @Override
+    public void visit(Page page, CrawlDatums next) {
+        String url = page.getUrl();
+        /*if the page is a news page*/
+        if (page.matchUrl("http://news.hfut.edu.cn/show-.*html")) {
+            /*we use jsoup to parse the page*/
+            Document doc = page.getDoc();
+
+            /*extract the title and content of the news by css selector*/
+            String title = page.select("div[id=Article]>h2").first().text();
+            String content = page.select("div#artibody", 0).text();
+
+            System.out.println("URL:\n" + url);
+            System.out.println("title:\n" + title);
+            System.out.println("content:\n" + content);
+
+            /*If you want to add urls to crawl, add them to next*/
+            /*WebCollector automatically filters links that have been fetched before*/
+            /*If autoParse is true and a link added to next does not match the regex rules, that link will also be filtered.*/
+            //next.add("http://xxxxxx.com");
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        NewsCrawler crawler = new NewsCrawler("crawl", true);
+        crawler.setThreads(50);
+        crawler.setTopN(100);
+        //crawler.setResumable(true);
+        /*start crawling with a depth of 4*/
+        crawler.start(4);
+    }
+
+}
+```
+
+
+
+##Content Extraction
+WebCollector can automatically extract content from news webpages:
+
+```java
+News news = ContentExtractor.getNewsByHtml(html, url);
+News news = ContentExtractor.getNewsByHtml(html);
+News news = ContentExtractor.getNewsByUrl(url);
+
+String content = ContentExtractor.getContentByHtml(html, url);
+String content = ContentExtractor.getContentByHtml(html);
+String content = ContentExtractor.getContentByUrl(url);
+
+Element contentElement = ContentExtractor.getContentElementByHtml(html, url);
+Element contentElement = ContentExtractor.getContentElementByHtml(html);
+Element contentElement = ContentExtractor.getContentElementByUrl(url);
+```
+
+
+##Other Documentation
+
++ [中文文档 (Chinese Documentation)](https://github.com/CrawlScript/WebCollector/blob/master/README.zh-cn.md)
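
The Content Extraction section in the README restored by PATCH 2/2 lists only method signatures. The sketch below shows one way those calls might be wired up end to end; it is a minimal sketch, not a confirmed implementation. The import package (`cn.edu.hfut.dmic.contentextractor`), the `News` accessors `getTitle()` and `getContent()`, and the example URL are assumptions, not details established by these patches.

```java
// Minimal usage sketch for the ContentExtractor API listed in the README.
// ASSUMPTIONS: the package name, the News getters, and the URL below are
// illustrative only; check the jars from the HomePage for the exact names.
import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;

public class ExtractDemo {

    public static void main(String[] args) throws Exception {
        /*download the page at a (placeholder) news url and extract its title and content*/
        News news = ContentExtractor.getNewsByUrl("http://news.hfut.edu.cn/show-xxxx.html");

        /*print the extracted fields, mirroring the Quickstart output format*/
        System.out.println("title:\n" + news.getTitle());
        System.out.println("content:\n" + news.getContent());
    }
}
```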