From 51d384d54b4100c30435613e277c879e968ed171 Mon Sep 17 00:00:00 2001
From: Junjiu
Date: Sat, 7 Jan 2017 16:40:44 -0600
Subject: [PATCH 1/2] delete the readme

---
 .project  |  11 +++++
 README.md | 119 ------------------------------------------------------
 2 files changed, 11 insertions(+), 119 deletions(-)
 create mode 100644 .project

diff --git a/.project b/.project
new file mode 100644
index 00000000..db60faaa
--- /dev/null
+++ b/.project
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>WebCollector</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+	</buildSpec>
+	<natures>
+	</natures>
+</projectDescription>
diff --git a/README.md b/README.md
index c82d5fba..e69de29b 100644
--- a/README.md
+++ b/README.md
@@ -1,119 +0,0 @@
-#WebCollector
-WebCollector is an open source web crawler framework based on Java. It provides
-some simple interfaces for crawling the Web; you can set up a
-multi-threaded web crawler in less than 5 minutes.
-
-
-
-
-##HomePage
-[https://github.com/CrawlScript/WebCollector](https://github.com/CrawlScript/WebCollector)
-
-##Document
-[WebCollector-GitDoc](https://github.com/CrawlScript/WebCollector-GitDoc)
-
-
-
-##Installation
-
-### Without Maven
-WebCollector jars are available on the [HomePage](https://github.com/CrawlScript/WebCollector).
-
-+ __webcollector-version-bin.zip__ contains core jars.
-
-
-##Quickstart
-Let's crawl some news from hfut news. This demo prints the titles and contents extracted from hfut news pages.
-
-[NewsCrawler.java](https://github.com/CrawlScript/WebCollector/blob/master/NewsCrawler.java):
-
-```java
-import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
-import cn.edu.hfut.dmic.webcollector.model.Page;
-import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
-import org.jsoup.nodes.Document;
-
-/**
- * Crawling news from hfut news
- *
- * @author hu
- */
-public class NewsCrawler extends BreadthCrawler {
-
-    /**
-     * @param crawlPath crawlPath is the path of the directory which maintains
-     * information of this crawler
-     * @param autoParse if autoParse is true, BreadthCrawler will automatically
-     * extract links which match the regex rules from pages
-     */
-    public NewsCrawler(String crawlPath, boolean autoParse) {
-        super(crawlPath, autoParse);
-        /*start page*/
-        this.addSeed("http://news.hfut.edu.cn/list-1-1.html");
-
-        /*fetch urls like http://news.hfut.edu.cn/show-xxxx.html*/
-        this.addRegex("http://news.hfut.edu.cn/show-.*html");
-        /*do not fetch jpg|png|gif*/
-        this.addRegex("-.*\\.(jpg|png|gif).*");
-        /*do not fetch urls that contain #*/
-        this.addRegex("-.*#.*");
-    }
-
-    @Override
-    public void visit(Page page, CrawlDatums next) {
-        String url = page.getUrl();
-        /*if the page is a news page*/
-        if (page.matchUrl("http://news.hfut.edu.cn/show-.*html")) {
-            /*we use jsoup to parse the page*/
-            Document doc = page.getDoc();
-
-            /*extract the title and content of the news by css selector*/
-            String title = page.select("div[id=Article]>h2").first().text();
-            String content = page.select("div#artibody", 0).text();
-
-            System.out.println("URL:\n" + url);
-            System.out.println("title:\n" + title);
-            System.out.println("content:\n" + content);
-
-            /*If you want to add urls to crawl, add them to next*/
-            /*WebCollector automatically filters links that have been fetched before*/
-            /*If autoParse is true and a link added to next does not match the regex rules, that link will also be filtered.*/
-            //next.add("http://xxxxxx.com");
-        }
-    }
-
-    public static void main(String[] args) throws Exception {
-        NewsCrawler crawler = new NewsCrawler("crawl", true);
-        crawler.setThreads(50);
-        crawler.setTopN(100);
-        //crawler.setResumable(true);
-        /*start crawling with a depth of 4*/
-        crawler.start(4);
-    }
-
-}
-```
-
-
-
-##Content Extraction
-WebCollector can automatically extract content from news webpages:
-
-```java
-News news = ContentExtractor.getNewsByHtml(html, url);
-News news = ContentExtractor.getNewsByHtml(html);
-News news = ContentExtractor.getNewsByUrl(url);
-
-String content = ContentExtractor.getContentByHtml(html, url);
-String content = ContentExtractor.getContentByHtml(html);
-String content = ContentExtractor.getContentByUrl(url);
-
-Element contentElement = ContentExtractor.getContentElementByHtml(html, url);
-Element contentElement = ContentExtractor.getContentElementByHtml(html);
-Element contentElement = ContentExtractor.getContentElementByUrl(url);
-```
-
-
-##Other Documentation
-
-+ [中文文档 (Chinese Documentation)](https://github.com/CrawlScript/WebCollector/blob/master/README.zh-cn.md)

From 365d541997fd89c46cdd34cb312518c15065116a Mon Sep 17 00:00:00 2001
From: Junjiu
Date: Sat, 7 Jan 2017 16:52:40 -0600
Subject: [PATCH 2/2] Revert "delete the readme"

This reverts commit 51d384d54b4100c30435613e277c879e968ed171.

---
 .project  |  11 -----
 README.md | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+), 11 deletions(-)
 delete mode 100644 .project

diff --git a/.project b/.project
deleted file mode 100644
index db60faaa..00000000
--- a/.project
+++ /dev/null
@@ -1,11 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
-	<name>WebCollector</name>
-	<comment></comment>
-	<projects>
-	</projects>
-	<buildSpec>
-	</buildSpec>
-	<natures>
-	</natures>
-</projectDescription>
diff --git a/README.md b/README.md
index e69de29b..c82d5fba 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,119 @@
+#WebCollector
+WebCollector is an open source web crawler framework based on Java. It provides
+some simple interfaces for crawling the Web; you can set up a
+multi-threaded web crawler in less than 5 minutes.
+
+
+
+
+##HomePage
+[https://github.com/CrawlScript/WebCollector](https://github.com/CrawlScript/WebCollector)
+
+##Document
+[WebCollector-GitDoc](https://github.com/CrawlScript/WebCollector-GitDoc)
+
+
+
+##Installation
+
+### Without Maven
+WebCollector jars are available on the [HomePage](https://github.com/CrawlScript/WebCollector).
+
++ __webcollector-version-bin.zip__ contains core jars.
+
+
+##Quickstart
+Let's crawl some news from hfut news. This demo prints the titles and contents extracted from hfut news pages.
+
+[NewsCrawler.java](https://github.com/CrawlScript/WebCollector/blob/master/NewsCrawler.java):
+
+```java
+import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
+import cn.edu.hfut.dmic.webcollector.model.Page;
+import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
+import org.jsoup.nodes.Document;
+
+/**
+ * Crawling news from hfut news
+ *
+ * @author hu
+ */
+public class NewsCrawler extends BreadthCrawler {
+
+    /**
+     * @param crawlPath crawlPath is the path of the directory which maintains
+     * information of this crawler
+     * @param autoParse if autoParse is true, BreadthCrawler will automatically
+     * extract links which match the regex rules from pages
+     */
+    public NewsCrawler(String crawlPath, boolean autoParse) {
+        super(crawlPath, autoParse);
+        /*start page*/
+        this.addSeed("http://news.hfut.edu.cn/list-1-1.html");
+
+        /*fetch urls like http://news.hfut.edu.cn/show-xxxx.html*/
+        this.addRegex("http://news.hfut.edu.cn/show-.*html");
+        /*do not fetch jpg|png|gif*/
+        this.addRegex("-.*\\.(jpg|png|gif).*");
+        /*do not fetch urls that contain #*/
+        this.addRegex("-.*#.*");
+    }
+
+    @Override
+    public void visit(Page page, CrawlDatums next) {
+        String url = page.getUrl();
+        /*if the page is a news page*/
+        if (page.matchUrl("http://news.hfut.edu.cn/show-.*html")) {
+            /*we use jsoup to parse the page*/
+            Document doc = page.getDoc();
+
+            /*extract the title and content of the news by css selector*/
+            String title = page.select("div[id=Article]>h2").first().text();
+            String content = page.select("div#artibody", 0).text();
+
+            System.out.println("URL:\n" + url);
+            System.out.println("title:\n" + title);
+            System.out.println("content:\n" + content);
+
+            /*If you want to add urls to crawl, add them to next*/
+            /*WebCollector automatically filters links that have been fetched before*/
+            /*If autoParse is true and a link added to next does not match the regex rules, that link will also be filtered.*/
+            //next.add("http://xxxxxx.com");
+        }
+    }
+
+    public static void main(String[] args) throws Exception {
+        NewsCrawler crawler = new NewsCrawler("crawl", true);
+        crawler.setThreads(50);
+        crawler.setTopN(100);
+        //crawler.setResumable(true);
+        /*start crawling with a depth of 4*/
+        crawler.start(4);
+    }
+
+}
+```
+
+
+
+##Content Extraction
+WebCollector can automatically extract content from news webpages:
+
+```java
+News news = ContentExtractor.getNewsByHtml(html, url);
+News news = ContentExtractor.getNewsByHtml(html);
+News news = ContentExtractor.getNewsByUrl(url);
+
+String content = ContentExtractor.getContentByHtml(html, url);
+String content = ContentExtractor.getContentByHtml(html);
+String content = ContentExtractor.getContentByUrl(url);
+
+Element contentElement = ContentExtractor.getContentElementByHtml(html, url);
+Element contentElement = ContentExtractor.getContentElementByHtml(html);
+Element contentElement = ContentExtractor.getContentElementByUrl(url);
+```
+
+
+##Other Documentation
+
++ [中文文档 (Chinese Documentation)](https://github.com/CrawlScript/WebCollector/blob/master/README.zh-cn.md)
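
The Content Extraction section in the README restored by PATCH 2/2 lists only method signatures. The sketch below shows one way those calls might be wired up end to end; it is a minimal sketch, not a confirmed implementation. The import package (`cn.edu.hfut.dmic.contentextractor`), the `News` accessors `getTitle()` and `getContent()`, and the example URL are assumptions, not details established by these patches.

```java
// Minimal usage sketch for the ContentExtractor API listed in the README.
// ASSUMPTIONS: the package name, the News getters, and the URL below are
// illustrative only; check the jars from the HomePage for the exact names.
import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;

public class ExtractDemo {

    public static void main(String[] args) throws Exception {
        /*download the page at a (placeholder) news url and extract its title and content*/
        News news = ContentExtractor.getNewsByUrl("http://news.hfut.edu.cn/show-xxxx.html");

        /*print the extracted fields, mirroring the Quickstart output format*/
        System.out.println("title:\n" + news.getTitle());
        System.out.println("content:\n" + news.getContent());
    }
}
```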