diff --git a/BUGS.md b/BUGS.md new file mode 100644 index 0000000..94fb682 --- /dev/null +++ b/BUGS.md @@ -0,0 +1,37 @@ +# bugs + +Most bugs should be reported in the issue tracker. However, some are created by other packages such as Spooky and require +per-installation workarounds. + +### quickscrape/tiny-jsonrpc bug + +The details will differ according to where `node` is installed. Here's PMR's: +``` +Error: Cannot find module '/usr/local/n/versions/node/6.2.1/lib/node_modules/quickscrape/node_modules/spooky/lib/../node_modules/tiny-jsonrpc/lib/tiny-jsonrpc' so moving on to next url in list +Unsafe JavaScript attempt to access frame with URL about:blank from frame with URL file:///usr/local/n/versions/node/6.2.1/lib/node_modules/quickscrape/node_modules/casperjs/bin/bootstrap.js. Domains, protocols and ports must match. +/usr/local/n/versions/node/6.2.1/lib/node_modules/quickscrape/node_modules/eventemitter2/lib/eventemitter2.js:290 + throw arguments[1]; // Unhandled 'error' event + ^ + +Error: Child terminated with non-zero exit code 1 + at Spooky.<anonymous> (/usr/local/n/versions/node/6.2.1/lib/node_modules/quickscrape/node_modules/spooky/lib/spooky.js:210:17) + at emitTwo (events.js:106:13) + at ChildProcess.emit (events.js:191:7) + at Process.ChildProcess._handle.onexit (internal/child_process.js:204:12) +``` +find where your quickscrape is: +``` +which quickscrape +gives: +/usr/local/n/versions/node/6.2.1/bin/quickscrape +create the top level dir +/usr/local/n/versions/node/6.2.1/ +others might have +/home/$USER/.nvm/versions/node/v6.3.1 + +``` +then copy files from the `lib` directory (after adjusting) +``` +cd /usr/local/n/versions/node/6.2.1/lib/node_modules/quickscrape/ +cp -r node_modules/tiny-jsonrpc node_modules/spooky/node_modules +``` diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..1d43b56 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,58 @@ +# Contributing to quickscrape + +Thank you for taking the time to contribute! 
:+1: + +This is a set of guidelines for contributing to quickscrape. You don't need to follow them as strict rules; use your best judgement and feel free to propose changes to this document as well via a pull request. + +#### Table of Contents + +[Basics](#basics) + +[How can I contribute?](#how-can-i-contribute) + +[Local testing](#local-testing) + +## Basics + +quickscrape is based on Node.js. If you want an introduction on how to work on a project like this, you can find a comprehensive tutorial [here](http://www.nodebeginner.org/). + +## How can I contribute? + +### Report bugs + +If you encounter a bug, please let us know. You can raise a new issue [here](https://github.com/ContentMine/quickscrape/issues). Please include as much information in your report as possible, to help maintainers reproduce the problem. + +* A clear and descriptive title +* Describe the exact steps which reproduce the problem, e.g. the query you entered. +* Describe the behaviour following those steps, and where the problem occurred. +* Explain where it was different from what you expected to happen. +* Attach additional information to the report, such as error messages, or corrupted files. +* Add a `bug` label to the issue. + +Before submitting a bug, please check the [list of existing bugs](https://github.com/ContentMine/quickscrape/issues?q=is%3Aopen+is%3Aissue+label%3Abug) to see whether there is a similar issue open. You can then help by adding your information to an existing report. + +### Fixing bugs or implementing new features + +If you're not sure where to start, have a look at issues that have a `help wanted` label - here is a [list](https://github.com/ContentMine/quickscrape/issues?utf8=%E2%9C%93&q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22+). + +### Suggesting features or changes + +There is always room for improvement and we'd like to hear your perspective on it. + +Before creating a pull request, please raise an issue to discuss the proposed changes first. 
We can then make sure to make the best use of your efforts. + +## Local testing + +In order to set up your development environment for quickscrape, you need to install [Node.js](https://nodejs.org/en/). + +1. Create a fork on [github](https://help.github.com/articles/fork-a-repo/). + +1. Create a [new branch](https://www.atlassian.com/git/tutorials/using-branches/git-checkout) with a descriptive name. + +1. Work on your changes, and make regular commits to save them. + +1. Test your changes by running `npm install` within the repository and running quickscrape with `node bin/quickscrape.js`. + +1. When your changes work as intended, push them to your repository and [create a pull request](https://www.atlassian.com/git/tutorials/making-a-pull-request). + +1. We will then review the pull request and merge it as soon as possible. If problems arise, they will be discussed within the pull request. diff --git a/bin/quickscrape.js b/bin/quickscrape.js index bc80d22..3536886 100755 --- a/bin/quickscrape.js +++ b/bin/quickscrape.js @@ -11,7 +11,8 @@ var program = require('commander') , Scraper = thresher.Scraper , ep = require('../lib/eventparse.js') , loglevels = require('../lib/loglevels.js') - , outformat = require('../lib/outformat.js'); + , outformat = require('../lib/outformat.js') + , sanitize = require('sanitize-filename'); var pjson = require('../package.json'); @@ -35,13 +36,13 @@ program 'use a number instead of the URL to name output subdirectories') .option('-i, --ratelimit ', 'maximum number of scrapes per minute (default 3)', 3) - .option('-h --headless', + .option('-h, --headless', 'render all pages in a headless browser') .option('-l, --loglevel ', 'amount of information to log ' + '(silent, verbose, info*, data, warn, error, or debug)', 'info') - .option('-f, --outformat ', + .option('-g, --outformat ', 'JSON format to transform results into (currently only bibjson)') .option('-f, --logfile ', 'save log to specified file in output directory as well as printing to 
terminal') @@ -84,8 +85,9 @@ process.chdir(program.output); tld = process.cwd(); if (program.hasOwnProperty('logfile')) { + var logfilestream = fs.createWriteStream(program.logfile.toString()) log.add(winston.transports.File, { - filename: program.logfile, + stream: logfilestream, level: 'debug' }); log.info('Saving logs to ./' + program.output + '/' + program.logfile); @@ -223,7 +225,7 @@ var processUrl = function(url) { // url-specific output dir var dir = program.numberdirs ? ('' + i) : url.replace(/\/+/g, '_').replace(/:/g, ''); - dir = path.join(tld, dir); + dir = sanitize(path.join(tld, dir)); if (!fs.existsSync(dir)) { log.debug('creating output directory: ' + dir); fs.mkdirSync(dir); diff --git a/package.json b/package.json index e7626b3..6accc89 100644 --- a/package.json +++ b/package.json @@ -31,7 +31,8 @@ "moment": "~2.10.2", "thresher": "^0.1.11", "which": "~1.0.5", - "winston": "~1.0.0" + "winston": "~1.0.0", + "sanitize-filename": "1.6.0" }, "bin": { "quickscrape": "bin/quickscrape.js"