Skip to content
This repository was archived by the owner on Sep 21, 2021. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
node_modules
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ JavaScript port of [TextStatistics.php](https://github.com/DaveChild/Text-Statis
I've done what I think is a reasonably faithful port. Documentation incoming!
I removed a lot of the original comments during the port, but seeing as the API remained largely the same, I'll add them in shortly.

Same goes for a test suite - I'll get something working in node in a bit. :)
The beginning of a test suite in [Mocha](https://mochajs.org/) is here, covering cleaning the text and some cases of word and sentence counting.

## Installation

Expand Down
28 changes: 15 additions & 13 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,23 @@

fullStopTags.forEach(function(tag) {
text = text.replace("</" + tag + ">",".");
})
});

text = text
.replace(/<[^>]+>/g, "") // Strip tags
.replace(/[,:;()\-]/, " ") // Replace commans, hyphens etc (count them as spaces)
.replace(/[\.!?]/, ".") // Unify terminators
.replace(/^\s+/,"") // Strip leading whitespace
.replace(/[ ]*(\n|\r\n|\r)[ ]*/," ") // Replace new lines with spaces
.replace(/([\.])[\. ]+/,".") // Check for duplicated terminators
.replace(/[ ]*([\.])/,". ") // Pad sentence terminators
.replace(/\s+/," ") // Remove multiple spaces
.replace(/\s+$/,""); // Strip trailing whitespace
.replace(/[,:;()\/&+]|\-\-/g, " ") // Replace commas, hyphens etc (count them as spaces)
.replace(/[\.!?]/g, ".") // Unify terminators
.replace(/^\s+/, "") // Strip leading whitespace
.replace(/[\.]?(\w+)[\.]?(\w+)@(\w+)[\.](\w+)[\.]?/g, "$1$2@$3$4") // strip periods in email addresses (so they remain counted as one word)
.replace(/[ ]*(\n|\r\n|\r)[ ]*/g, ".") // Replace new lines with periods
.replace(/([\.])[\.]+/g, ".") // Check for duplicated terminators
.replace(/[ ]*([\.])/g, ". ") // Pad sentence terminators
.replace(/\s+/g, " ") // Remove multiple spaces
.replace(/\s+$/, ""); // Strip trailing whitespace

text += "."; // Add final terminator, just in case it's missing.

if(text.slice(-1) != '.') {
text += "."; // Add final terminator, just in case it's missing.
}
return text;
}

Expand Down Expand Up @@ -84,7 +86,7 @@

TextStatistics.prototype.wordCount = function(text) {
text = text ? cleanText(text) : this.text;
return text.split(/[^a-z0-9]+/i).length || 1;
return text.split(/[^a-z0-9\'@\.\-]+/i).length || 1;
};

TextStatistics.prototype.averageWordsPerSentence = function(text) {
Expand Down Expand Up @@ -210,7 +212,7 @@
wordPartCount = word
.split(/[^aeiouy]+/ig)
.filter(function(wordPart) {
return !!wordPart.replace(/\s+/ig,"").length
return !!wordPart.replace(/\s+/ig,"").length;
})
.length;

Expand Down
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
},
"main": "index.js",
"dependencies": {},
"devDependencies": {},
"devDependencies": {
"mocha": "^3.0.2"
},
"optionalDependencies": {}
}
1 change: 1 addition & 0 deletions test/mocha.opts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
--reporter nyan
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Heh.

181 changes: 181 additions & 0 deletions test/testCleanText.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
var assert = require('assert');
var TextStatistics = require('../index.js');

describe('TextStatistics', function() {
// this is called when you "make" a TextStatistics
describe('#cleanText()', function() {
// Final-terminator handling: the cleaned text (exposed as `.text`) should
// end in a single '.' whether or not the input already had one.
it('should add a final terminator if it\'s missing', function() {
  var cleaned = TextStatistics('Hello friend').text;
  assert.equal(cleaned, 'Hello friend.');
});

it('should not add a final terminator if there is a \'.\'', function() {
  var cleaned = TextStatistics('Hello friend.').text;
  assert.equal(cleaned, 'Hello friend.');
});

// Trailing whitespace of any kind must be removed from the cleaned text.
context('trailing whitespace', function() {
  // Each row: [test title, raw input]. Every input cleans to the same text.
  var cases = [
    ['should strip spaces', 'Hello friend. '],
    ['should strip newlines', 'Hello friend.\n\n'],
    ['should strip \\r\\n thing', 'Hello friend.\r\n'],
    ['should strip tabs', 'Hello friend.\t']
  ];

  cases.forEach(function(testCase) {
    it(testCase[0], function() {
      assert.equal(TextStatistics(testCase[1]).text, 'Hello friend.');
    });
  });
});

// Leading whitespace of any kind must be removed from the cleaned text.
context('leading whitespace', function() {
  var expected = 'Hello friend.';

  // Test title -> raw input; string keys iterate in insertion order, so the
  // tests are registered in the order listed here.
  var inputsByTitle = {
    'should strip spaces': ' Hello friend.',
    'should strip newlines': '\n\nHello friend.',
    'should strip \\r\\n thing': '\r\nHello friend.',
    'should strip tabs': '\tHello friend.'
  };

  Object.keys(inputsByTitle).forEach(function(title) {
    it(title, function() {
      var stats = TextStatistics(inputsByTitle[title]);
      assert.equal(stats.text, expected);
    });
  });
});

// Runs of spaces between words should collapse to a single space.
// The input deliberately contains several consecutive spaces: with single
// spaces the assertion would pass without exercising the collapsing step.
it('should remove multiple spaces between words', function() {
  var ts = TextStatistics('Hello   good     friend.');
  assert.equal(ts.text, 'Hello good friend.');
});

// Terminator normalisation: runs of periods collapse to one, and a
// terminator in the middle of the text is followed by a single space.
it('should un-duplicate terminators', function() {
  assert.equal(TextStatistics('Hello... Friend..').text, 'Hello. Friend.');
});

it('should pad terminators with a space', function() {
  assert.equal(TextStatistics('Hello.Good.Friend.').text, 'Hello. Good. Friend.');
});

// '!' and '?' are both treated as sentence terminators and rewritten to '.'.
context('unify terminators', function() {
  // Builds a TextStatistics from the raw input and returns its cleaned text.
  function cleaned(raw) {
    return TextStatistics(raw).text;
  }

  it('should replace all !! with ..', function() {
    assert.equal(cleaned('Hello! Friend!'), 'Hello. Friend.');
  });

  it('should replace all ?? with ..', function() {
    assert.equal(cleaned('Hello? Friend?'), 'Hello. Friend.');
  });
});

// Line breaks (\n, \r\n or \r) act as sentence boundaries, so a list with
// one item per line is cleaned into one sentence per item.
context('replacing newlines with terminators', function() {
  var expected = 'bulleted list here we go. nice dog. good dog.';

  // Each row: [test title, raw input using that line-break style].
  [
    ['should replace \\n', 'bulleted list here we go\nnice dog\ngood dog'],
    ['should replace \\r\\n', 'bulleted list here we go\r\nnice dog\r\ngood dog'],
    ['should replace \\r', 'bulleted list here we go\rnice dog\rgood dog']
  ].forEach(function(row) {
    it(row[0], function() {
      var stats = TextStatistics(row[1]);
      assert.equal(stats.text, expected);
    });
  });
});

// Periods inside an email address are removed during cleaning so that the
// address is not split into several sentences/words. Each input is a real
// address shaped to match its test title; the expected value is the same
// address with every period removed, plus the final terminator.
// NOTE(review): the addresses were reconstructed from the expected outputs
// and titles (the inputs had been replaced by an email-obfuscation
// placeholder) — the exact period positions in the local part are a
// best guess; confirm against the original commit.
context('stripping periods from email addresses', function() {
  it('should replace a single period', function() {
    var ts = TextStatistics('textstatistics@example.com');
    assert.equal(ts.text, 'textstatistics@examplecom.');
  });

  it('should replace a single period in the first part', function() {
    var ts = TextStatistics('text.statistics@example.com');
    assert.equal(ts.text, 'textstatistics@examplecom.');
  });

  it('should replace two periods in the first part', function() {
    var ts = TextStatistics('text.stat.istics@example.com');
    assert.equal(ts.text, 'textstatistics@examplecom.');
  });

  it('should replace periods with a subdomain', function() {
    var ts = TextStatistics('textstatistics@test.example.com');
    assert.equal(ts.text, 'textstatistics@testexamplecom.');
  });

  it('should replace periods with a subdomain and before the @', function() {
    var ts = TextStatistics('text.statistics@test.example.com');
    assert.equal(ts.text, 'textstatistics@testexamplecom.');
  });
});

context('replacing non-terminator punctuation', function() {
// Non-terminator punctuation is replaced by spaces (which then collapse),
// while a single hyphen inside a word is kept.
// Each row: [test title, raw input, expected cleaned text].
[
  ['should replace commas with spaces', 'Hello, hi, friend.', 'Hello hi friend.'],
  ['should replace colons with spaces', 'Hello: hi: friend.', 'Hello hi friend.'],
  ['should replace semicolons with spaces', 'Hello; hi; friend.', 'Hello hi friend.'],
  ['should replace parentheses with spaces', '(Hello (hi) friend).', 'Hello hi friend.'],
  ['should replace slashes with spaces', 'Hello/hi/friend.', 'Hello hi friend.'],
  ['should replace double hyphens with spaces', 'Hello--hi--friend.', 'Hello hi friend.'],
  ['should not replace a single dash with spaces', 'Hi-di-ho friend-person!', 'Hi-di-ho friend-person.'],
  ['should replace pluses with spaces', 'Hello + hi+friend.', 'Hello hi friend.'],
  ['should replace ampersands with spaces', 'Hello&hi & friend.', 'Hello hi friend.']
].forEach(function(row) {
  it(row[0], function() {
    var stats = TextStatistics(row[1]);
    assert.equal(stats.text, row[2]);
  });
});

it('should replace em-dash with spaces'); // can I do that?
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Em dash is usually used as a word terminator in my experience, rather than to create hyphenated pairs, so I don't see why not!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, my comment there is more about figuring out where to get the emdash. I was apparently in too much of a hurry to copy/paste it.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, cool. Sorry, didn't intend to come across as patronising! Apologies if that was the case.

Not sure what OS you use, but I love em-dash, and there's a nice shortcut for it on Mac OS — option+shift+dash. Option+dash will give you an en-dash. :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No worries, I didn't read it as patronizing. My comment wasn't super clear.

});
});
});
54 changes: 54 additions & 0 deletions test/testCountMethods.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
var assert = require('assert');
var TextStatistics = require('../index.js');

describe('TextStatistics', function() {

// Sentence counting over text supplied at construction time.
// Assertions use assert.equal(actual, expected) so that a failure reports
// the computed count as "actual" and the literal as "expected".
describe('#sentenceCount()', function() {
  it('should count a single sentence', function() {
    var ts = TextStatistics('see spot run.');
    assert.equal(ts.sentenceCount(), 1);
  });

  // A comma is not a sentence terminator.
  it('should count a single sentence with a comma', function() {
    var ts = TextStatistics('see, spot runs.');
    assert.equal(ts.sentenceCount(), 1);
  });

  it('should count a few simple sentences', function() {
    var ts = TextStatistics('see spot run. good job spot. have a treat.');
    assert.equal(ts.sentenceCount(), 3);
  });
});

// Word counting over text supplied at construction time.
// Assertions use assert.equal(actual, expected) so that a failure reports
// the computed count as "actual" and the literal as "expected".
describe('#wordCount()', function() {
  it('a string w/o words should have word count of one, because dividing by zero', function() {
    var ts = TextStatistics('.');
    assert.equal(ts.wordCount(), 1);
  });

  it('should count the number of words in a text', function() {
    var ts = TextStatistics('see spot run');
    assert.equal(ts.wordCount(), 3);
  });

  it('should not count words with an apostrophe as two words', function() {
    var ts = TextStatistics('they\'re');
    assert.equal(ts.wordCount(), 1);
  });

  it('should not count the empty string after a period as a word', function() {
    var ts = TextStatistics('dog.');
    assert.equal(ts.wordCount(), 1);
  });

  // NOTE(review): the address was reconstructed (the input had been replaced
  // by an email-obfuscation placeholder); any well-formed address exercises
  // the same path — confirm against the original commit.
  it('should count an email address as a single word', function() {
    var ts = TextStatistics('textstatistics@example.com');
    assert.equal(ts.wordCount(), 1);
  });

  it('should count words with a dash as a single word', function() {
    var ts = TextStatistics('long-term');
    assert.equal(ts.wordCount(), 1);
  });
});
});