From c126c87818eb06aa5c2ac23b362d504f342c72b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 15 Mar 2016 00:22:02 +0200 Subject: [PATCH 01/22] add language files --- .../danish.txt | 94 ++++++ .../dutch.txt | 101 ++++++ .../english.txt | 319 ++++++++++++++++++ .../finnish.txt | 235 +++++++++++++ .../french.txt | 155 +++++++++ .../german.txt | 231 +++++++++++++ .../hungarian.txt | 199 +++++++++++ .../italian.txt | 279 +++++++++++++++ .../norwegian.txt | 176 ++++++++++ .../portuguese.txt | 203 +++++++++++ .../russian.txt | 151 +++++++++ .../spanish.txt | 313 +++++++++++++++++ .../swedish.txt | 114 +++++++ .../turkish.txt | 53 +++ 14 files changed, 2623 insertions(+) create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt new file mode 100644 index 000000000000..d3edc6757912 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt @@ -0,0 +1,94 @@ +og +i +jeg +det +at +en +den +til +er +som +på +de +med +han +af +for +ikke +der +var +mig +sig +men +et +har +om +vi +min +havde +ham +hun +nu +over +da +fra +du +ud +sin +dem +os +op +man +hans +hvor +eller +hvad +skal +selv +her +alle +vil +blev +kunne +ind +når +være +dog +noget +ville +jo +deres +efter +ned +skulle +denne +end +dette +mit +også +under +have +dig +anden +hende +mine +alt +meget +sit +sine +vor +mod +disse +hvis +din +nogle +hos +blive +mange +ad +bliver +hendes +været +thi +jer +sådan diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt new file mode 100644 index 000000000000..cafa0324b537 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt @@ -0,0 +1,101 @@ +de +en +van +ik +te +dat +die +in +een +hij +het +niet +zijn +is +was +op +aan +met +als +voor +had +er +maar +om +hem +dan +zou +of +wat +mijn +men +dit +zo +door +over +ze +zich +bij +ook +tot +je +mij +uit +der +daar +haar +naar +heb +hoe +heeft +hebben +deze +u +want +nog +zal +me +zij +nu +ge +geen +omdat +iets +worden +toch +al +waren +veel +meer +doen +toen +moet +ben +zonder +kan +hun +dus +alles +onder +ja +eens +hier +wie +werd +altijd +doch +wordt +wezen +kunnen +ons +zelf +tegen +na +reeds +wil +kon +niets +uw +iemand +geweest +andere diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt new file mode 100644 index 000000000000..61e5350dcde3 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt @@ -0,0 +1,319 @@ +a +about +above +across +after +afterwards +again +against +all +almost +alone +along +already +also +although +always +am +among +amongst +amoungst +amount +an +and +another +any +anyhow +anyone +anything +anyway +anywhere +are +around +as +at +back +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +below +beside +besides +between +beyond +bill +both +bottom +but +by +call +can +cannot +cant +co +computer +con +could +couldnt +cry +de +describe +detail +do +done +down +due +during +each +eg +eight +either +eleven +else +elsewhere +empty +enough +etc +even +ever +every +everyone +everything +everywhere +except +few +fifteen +fify +fill +find +fire +first +five +for +former +formerly +forty +found +four +from +front +full +further +get +give +go +had +has +hasnt +have +he +hence +her +here +hereafter +hereby +herein +hereupon +hers +herself +him +himself +his +how +however +hundred +i +ie +if +in +inc +indeed +interest +into +is +it +its +itself +keep +last +latter +latterly +least +less +ltd +made +many +may +me +meanwhile +might +mill +mine +more +moreover +most +mostly +move +much +must +my +myself +name +namely +neither +never +nevertheless +next +nine +no +nobody +none +noone +nor +not +nothing +now +nowhere +of +off +often +on +once +one +only +onto +or +other +others +otherwise +our +ours +ourselves +out +over +own +part +per +perhaps +please +put +rather +re +same +see +seem +seemed +seeming +seems +serious +several +she +should +show +side +since +sincere +six +sixty +so +some +somehow +someone +something +sometime +sometimes +somewhere +still +such +system +take +ten +than +that +the +their +them +themselves +then +thence +there +thereafter +thereby +therefore +therein +thereupon +these +they +thick +thin +third +this +those +though +three +through +throughout +thru +thus +to +together +too +top +toward +towards +twelve +twenty +two +un +under +until +up +upon +us +very +via +was +we +well +were +what +whatever +when +whence +whenever +where +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +while +whither +who +whoever +whole +whom +whose +why +will +with +within +without +would +yet +you +your +yours +yourself +yourselves diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt new file mode 100644 index 000000000000..47ee200f6781 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt @@ -0,0 +1,235 @@ +olla +olen +olet +on +olemme +olette +ovat +ole +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet +en +et +ei +emme +ette +eivät +minä +minun +minut +minua +minussa +minusta +minuun +minulla +minulta +minulle +sinä +sinun +sinut +sinua +sinussa +sinusta +sinuun +sinulla +sinulta +sinulle +hän +hänen +hänet +häntä +hänessä +hänestä +häneen +hänellä +häneltä +hänelle +me +meidän +meidät +meitä +meissä +meistä +meihin +meillä +meiltä +meille +te +teidän +teidät +teitä +teissä +teistä +teihin +teillä +teiltä +teille +he +heidän +heidät +heitä +heissä +heistä +heihin +heillä +heiltä +heille +tämä +tämän +tätä +tässä +tästä +tähän +tallä +tältä +tälle +tänä +täksi +tuo +tuon +tuotä +tuossa +tuosta +tuohon +tuolla +tuolta +tuolle +tuona +tuoksi +se +sen +sitä +siinä +siitä +siihen +sillä +siltä +sille +sinä +siksi +nämä +näiden +näitä +näissä +näistä +näihin +näillä +näiltä +näille +näinä +näiksi +nuo +noiden +noita +noissa +noista +noihin +noilla +noilta +noille +noina +noiksi +ne +niiden +niitä +niissä +niistä +niihin +niillä +niiltä +niille +niinä +niiksi +kuka +kenen +kenet +ketä +kenessä +kenestä +keneen +kenellä +keneltä +kenelle +kenenä +keneksi +ketkä +keiden +ketkä +keitä +keissä +keistä +keihin +keillä +keiltä +keille +keinä +keiksi +mikä +minkä +minkä +mitä +missä +mistä +mihin +millä +miltä +mille +minä +miksi +mitkä +joka +jonka +jota +jossa +josta +johon +jolla +jolta +jolle +jona +joksi +jotka +joiden +joita +joissa +joista +joihin +joilla +joilta +joille +joina +joiksi +että +ja +jos +koska +kuin +mutta +niin +sekä +sillä +tai +vaan +vai +vaikka +kanssa +mukaan +noin +poikki +yli +kun +niin +nyt +itse diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt new file mode 100644 index 000000000000..e7cbf4c97500 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt @@ -0,0 +1,155 @@ +au +aux +avec +ce +ces +dans +de +des +du +elle +en +et +eux +il +je +la +le +leur +lui +ma +mais +me +même +mes +moi +mon +ne +nos +notre +nous +on +ou +par +pas +pour +qu +que +qui +sa +se +ses +son +sur +ta +te +tes +toi +ton +tu +un +une +vos +votre +vous +c +d +j +l +à +m +n +s +t +y +été +étée +étées +étés +étant +étante +étants +étantes +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent +ayant +ayante +ayantes +ayants +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt new file mode 100644 index 000000000000..edef220b7a7d --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt @@ -0,0 +1,231 @@ +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +der +den +des +dem +die +das +daß +derselbe +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe +dazu +dein +deine +deinem +deinen +deiner +deines +denn +derer +dessen +dich +dir +du +dies +diese +diesem +diesen +dieser +dieses +doch +dort +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +ihn +ihm +es +etwas +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +mich +mir +ihr +ihre +ihrem +ihren +ihrer +ihres +euch +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +ihnen +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +über +um +und +uns +unse +unsem +unsen +unser +unses +unter +viel +vom +von +vor +während +war +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +würde +würden +zu +zum +zur +zwar +zwischen diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt new file mode 100644 index 000000000000..94e9f9a0b07a --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt @@ -0,0 +1,199 @@ +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elõ +elõször +elõtt +elsõ +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +õ +õk +õket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt new file mode 100644 index 000000000000..6ee02b51fb17 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt @@ -0,0 +1,279 @@ +ad +al +allo +ai +agli +all +agl +alla +alle +con +col +coi +da +dal +dallo +dai +dagli +dall +dagl +dalla +dalle +di +del +dello +dei +degli +dell +degl +della +delle +in +nel +nello +nei +negli +nell +negl +nella +nelle +su +sul +sullo +sui +sugli +sull +sugl +sulla +sulle +per +tra +contro +io +tu +lui +lei +noi +voi +loro +mio +mia +miei +mie +tuo +tua +tuoi +tue +suo +sua +suoi +sue +nostro +nostra +nostri +nostre +vostro +vostra +vostri +vostre +mi +ti +ci +vi +lo +la +li +le +gli +ne +il +un +uno +una +ma +ed +se +perché +anche +come +dov +dove +che +chi +cui +non +più +quale +quanto +quanti +quanta +quante +quello +quelli +quella +quelle +questo +questi +questa +queste +si +tutto +tutti +a +c +e +i +l +o +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt new file mode 100644 index 000000000000..9ac1abbb6cba --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt @@ -0,0 +1,176 @@ +og +i +jeg +det +at +en +et +den +til +er +som +på +de +med +han +av +ikke +ikkje +der +så +var +meg +seg +men +ett +har +om +vi +min +mitt +ha +hadde +hun +nå +over +da +ved +fra +du +ut +sin +dem +oss +opp +man +kan +hans +hvor +eller +hva +skal +selv +sjøl +her +alle +vil +bli +ble +blei +blitt +kunne +inn +når +være +kom +noen +noe +ville +dere +som +deres +kun +ja +etter +ned +skulle +denne +for +deg +si +sine +sitt +mot +å +meget +hvorfor +dette +disse +uten +hvordan +ingen +din +ditt +blir +samme +hvilken +hvilke +sånn +inni +mellom +vår +hver +hvem +vors +hvis +både +bare +enn +fordi +før +mange +også +slik +vært +være +båe +begge +siden +dykk +dykkar +dei +deira +deires +deim +di +då +eg +ein +eit +eitt +elles +honom +hjå +ho +hoe +henne +hennar +hennes +hoss +hossen +ikkje +ingi +inkje +korleis +korso +kva +kvar +kvarhelst +kven +kvi +kvifor +me +medan +mi +mine +mykje +no +nokon +noka +nokor +noko +nokre +si +sia +sidan +so +somt +somme +um +upp +vere +vore +verte +vort +varte +vart diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt new file mode 100644 index 000000000000..6b2477863b7b --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt @@ -0,0 +1,203 @@ +de +a +o +que +e +do +da +em +um +para +com +não +uma +os +no +se +na +por +mais +as +dos +como +mas +ao +ele +das +à +seu +sua +ou +quando +muito +nos +já +eu +também +só +pelo +pela +até +isso +ela +entre +depois +sem +mesmo +aos +seus +quem +nas +me +esse +eles +você +essa +num +nem +suas +meu +às +minha +numa +pelos +elas +qual +nós +lhe +deles +essas +esses +pelas +este +dele +tu +te +vocês +vos +lhes +meus +minhas +teu +tua +teus +tuas +nosso +nossa +nossos +nossas +dela +delas +esta +estes +estas +aquele +aquela +aqueles +aquelas +isto +aquilo +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt new file mode 100644 index 000000000000..ecb83d4a7f39 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt @@ -0,0 +1,151 @@ +и +в +во +не +что +он +на +я +с +со +как +а +то +все +она +так +его +но +да +ты +к +у +же +вы +за +бы +по +только +ее +мне +было +вот +от +меня +еще +нет +о +из +ему +теперь +когда +даже +ну +вдруг +ли +если +уже +или +ни +быть +был +него +до +вас +нибудь +опять +уж +вам +ведь +там +потом +себя +ничего +ей +может +они +тут +где +есть +надо +ней +для +мы +тебя +их +чем +была +сам +чтоб +без +будто +чего +раз +тоже +себе +под +будет +ж +тогда +кто +этот +того +потому +этого +какой +совсем +ним +здесь +этом +один +почти +мой +тем +чтобы +нее +сейчас +были +куда +зачем +всех +никогда +можно +при +наконец +два +об +другой +хоть +после +над +больше +тот +через +эти +нас +про +всего +них +какая +много +разве +три +эту +моя +впрочем +хорошо +свою +этой +перед +иногда +лучше +чуть +том +нельзя +такой +им +более +всегда +конечно +всю +между diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt new file mode 100644 index 000000000000..59bc786caa49 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt @@ -0,0 +1,313 @@ +de +la +que +el +en +y +a +los +del +se +las +por +un +para +con +no +una +su +al +lo +como +más +pero +sus +le +ya +o +este +sí +porque +esta +entre +cuando +muy +sin +sobre +también +me +hasta +hay +donde +quien +desde +todo +nos +durante +todos +uno +les +ni +contra +otros +ese +eso +ante +ellos +e +esto +mí +antes +algunos +qué +unos +yo +otro +otras +otra +él +tanto +esa +estos +mucho +quienes +nada +muchos +cual +poco +ella +estar +estas +algunas +algo +nosotros +mi +mis +tú +te +ti +tu +tus +ellas +nosotras +vosostros +vosostras +os +mío +mía +míos +mías +tuyo +tuya +tuyos +tuyas +suyo +suya +suyos +suyas +nuestro +nuestra +nuestros +nuestras +vuestro +vuestra +vuestros +vuestras +esos +esas +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería +serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +sintiendo +sentido +sentida +sentidos +sentidas +siente +sentid +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt new file mode 100644 index 000000000000..742bb6263b99 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt @@ -0,0 +1,114 @@ +och +det +att +i +en +jag +hon +som +han +på +den +med +var +sig +för +så +till +är +men +ett +om +hade +de +av +icke +mig +du +henne +då +sin +nu +har +inte +hans +honom +skulle +hennes +där +min +man +ej +vid +kunde +något +från +ut +när +efter +upp +vi +dem +vara +vad +över +än +dig +kan +sina +här +ha +mot +alla +under +någon +eller +allt +mycket +sedan +ju +denna +själv +detta +åt +utan +varit +hur +ingen +mitt +ni +bli +blev +oss +din +dessa +några +deras +blir +mina +samma +vilken +er +sådan +vår +blivit +dess +inom +mellan +sådant +varför +varje +vilka +ditt +vem +vilket +sitta +sådana +vart +dina +vars +vårt +våra +ert +era +vilkas diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt new file mode 100644 index 000000000000..5a48ccce0737 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt @@ -0,0 +1,53 @@ +acaba +ama +aslında +az +bazı +belki +biri +birkaç +birşey +biz +bu +çok +çünkü +da +daha +de +defa +diye +eğer +en +gibi +hem +hep +hepsi +her +hiç +için +ile +ise +kez +ki +kim +mı +mu +mü +nasıl +ne +neden +nerde +nerede +nereye +niçin +niye +o +sanki +şey +siz +şu +tüm +ve +veya +ya +yani From 8248579ec27a40de98fe1f3020d947c478981ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 15 Mar 2016 00:23:32 +0200 Subject: [PATCH 02/22] add multi-language support for stop words --- .../spark/ml/feature/StopWordsRemover.scala | 179 ++++++++++-------- 1 file changed, 105 insertions(+), 74 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 0d4c96863329..e7f1d8323376 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -19,71 +19,49 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer -import org.apache.spark.ml.param.{BooleanParam, ParamMap, StringArrayParam} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, StringArrayParam} import org.apache.spark.ml.util._ import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StringType, StructType} /** - * stop words list - */ + * stop words list + */ private[spark] object StopWords { - /** - * Use the same default stopwords list as scikit-learn. - * The original list can be found from "Glasgow Information Retrieval Group" - * [[http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words]] - */ - val English = Array( "a", "about", "above", "across", "after", "afterwards", "again", - "against", "all", "almost", "alone", "along", "already", "also", "although", "always", - "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", - "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", - "around", "as", "at", "back", "be", "became", "because", "become", - "becomes", "becoming", "been", "before", "beforehand", "behind", "being", - "below", "beside", "besides", "between", "beyond", "bill", "both", - "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", - "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", - "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", - "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", - "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", - "find", "fire", "first", "five", "for", "former", "formerly", "forty", - "found", "four", "from", "front", "full", "further", "get", "give", "go", - "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", - "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", - "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", - "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", - "latterly", "least", "less", "ltd", "made", "many", "may", "me", - "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", - "move", "much", "must", "my", "myself", "name", "namely", "neither", - "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", - "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", - "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", - "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", - "please", "put", "rather", "re", "same", "see", "seem", "seemed", - "seeming", "seems", "serious", "several", "she", "should", "show", "side", - "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", - "something", "sometime", "sometimes", "somewhere", "still", "such", - "system", "take", "ten", "than", "that", "the", "their", "them", - "themselves", "then", "thence", "there", "thereafter", "thereby", - "therefore", "therein", "thereupon", "these", "they", "thick", "thin", - "third", "this", "those", "though", "three", "through", "throughout", - "thru", "thus", "to", "together", "too", "top", "toward", "towards", - "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", - "very", "via", "was", "we", "well", "were", "what", "whatever", "when", - "whence", "whenever", "where", "whereafter", "whereas", "whereby", - "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", - "who", "whoever", "whole", "whom", "whose", "why", "will", "with", - "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves") + def readStopWords(language: String): Array[String] = { + val is = getClass.getResourceAsStream(s"/$language.txt") + scala.io.Source.fromInputStream(is).getLines().toArray + } + + lazy val Danish = readStopWords("/danish.txt") + lazy val Dutch = readStopWords("/dutch.txt") + lazy val English = readStopWords("/english.txt") + lazy val Finnish = readStopWords("/finnish.txt") + lazy val French = readStopWords("/french.txt") + lazy val German = readStopWords("/german.txt") + lazy val Hungarian = readStopWords("/hungarian.txt") + lazy val Italian = readStopWords("/italian.txt") + lazy val Norwegian = readStopWords("/norwegian.txt") + lazy val Portuguese = readStopWords("/portuguese.txt") + lazy val Russian = readStopWords("/russian.txt") + lazy val Spanish = readStopWords("/spanish.txt") + lazy val Swedish = readStopWords("/swedish.txt") + lazy val Turkish = readStopWords("/turkish.txt") + + val languageMap = Map("danish" -> Danish, "dutch" -> Dutch, "english" -> English, + "finnish" -> Finnish, "french" -> French, "german" -> German, "hungarian" -> Hungarian, + "italian" -> Italian, "norwegian" -> Norwegian, "portuguese" -> Portuguese, + "russian" -> Russian, "spanish" -> Spanish, "swedish" -> Swedish, "turkish" -> Turkish) } /** - * :: Experimental :: - * A feature transformer that filters out stop words from input. - * Note: null values from input array are preserved unless adding null to stopWords explicitly. - * @see [[http://en.wikipedia.org/wiki/Stop_words]] - */ + * :: Experimental :: + * A feature transformer that filters out stop words from input. + * Note: null values from input array are preserved unless adding null to stopWords explicitly. + * @see [[http://en.wikipedia.org/wiki/Stop_words]] + */ @Experimental class StopWordsRemover(override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { @@ -97,23 +75,26 @@ class StopWordsRemover(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * the stop words set to be filtered out - * Default: [[StopWords.English]] - * @group param - */ + * the stop words set to be filtered out + * Default: [[StopWords.English]] + * @group param + */ val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") /** @group setParam */ - def setStopWords(value: Array[String]): this.type = set(stopWords, value) + def setStopWords(value: Array[String]): this.type = { + set(stopWords, value) + set(language, "unknown") + } /** @group getParam */ def getStopWords: Array[String] = $(stopWords) /** - * whether to do a case sensitive comparison over the stop words - * Default: false - * @group param - */ + * whether to do a case sensitive comparison over the stop words + * Default: false + * @group param + */ val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive", "whether to do case-sensitive comparison during filtering") @@ -123,21 +104,71 @@ class StopWordsRemover(override val uid: String) /** @group getParam */ def getCaseSensitive: Boolean = $(caseSensitive) - setDefault(stopWords -> StopWords.English, caseSensitive -> false) + /** + * the language of stop words + * Default: "english" + * @group param + */ + val language: Param[String] = new Param[String](this, "language", "stopwords language") + + /** @group setParam */ + def setLanguage(value: String): this.type = { + val lang = value.toLowerCase + require(StopWords.languageMap.contains(lang), s"$lang is not in language list") + set(language, lang) + set(stopWords, StopWords.languageMap(lang)) + } + + /** @group getParam */ + def getLanguage: String = $(language) + + /** + * the ignored stop words set to be ignored out + * Default: [[Array.empty]] + * @group param + */ + val ignoredWords: StringArrayParam = new StringArrayParam(this, "ignoredWords", + "the ignored stop words set to be ignored out") + + /** @group setParam */ + def setIgnoredWords(value: Array[String]): this.type = set(ignoredWords, value) + + /** @group getParam */ + def getIgnoredWords: Array[String] = $(ignoredWords) + + /** + * the additional stop words set to be filtered out + * Default: [[Array.empty]] + * @group param + */ + val additionalWords: StringArrayParam = new StringArrayParam(this, "additionalWords", + "the additional stop words set to be filtered out") + + /** @group setParam */ + def setAdditionalWords(value: Array[String]): this.type = set(additionalWords, value) + + /** @group getParam */ + def getAdditionalWords: Array[String] = $(additionalWords) + + setDefault(stopWords -> StopWords.English, language -> "en", ignoredWords -> Array.empty[String] + , additionalWords -> Array.empty[String], caseSensitive -> false) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val t = if ($(caseSensitive)) { - val stopWordsSet = $(stopWords).toSet - udf { terms: Seq[String] => - terms.filter(s => !stopWordsSet.contains(s)) - } - } else { - val toLower = (s: String) => if (s != null) s.toLowerCase else s - val lowerStopWords = $(stopWords).map(toLower(_)).toSet - udf { terms: Seq[String] => - terms.filter(s => !lowerStopWords.contains(toLower(s))) - } + val stopWordsSet = ($(stopWords) ++ $(additionalWords)).toSet - $(ignoredWords).toSet + udf { terms: Seq[String] => + terms.filter(s => !stopWordsSet.contains(s)) + } + } else { + val toLower = (s: String) => if (s != null) s.toLowerCase else s + val lowerStopWords = { + ($(stopWords) ++ $(additionalWords)) + .map(toLower(_)).toSet - $(ignoredWords).map(toLower(_)).toSet + } + udf { terms: Seq[String] => + terms.filter(s => !lowerStopWords.contains(toLower(s))) + } } val metadata = outputSchema($(outputCol)).metadata From 2c7b73df14d2d292eff88d7f3c358d29f82f6122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 15 Mar 2016 00:24:41 +0200 Subject: [PATCH 03/22] add new tests for StopWordsRemover --- .../ml/feature/StopWordsRemoverSuite.scala | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index a5b24c18565b..92c177ad6861 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -67,12 +67,26 @@ class StopWordsRemoverSuite testStopWordsRemover(remover, dataSet) } + test("StopWordsRemover with ignored words") { + val ignoredWords = Array("a") + val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") + .setIgnoredWords(ignoredWords) + val dataSet = sqlContext.createDataFrame(Seq( + (Seq("python", "scala", "a"), Seq("python", "scala", "a")), + (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift")) + )).toDF("raw", "expected") + + testStopWordsRemover(remover, dataSet) + } + test("StopWordsRemover with additional words") { - val stopWords = StopWords.English ++ Array("python", "scala") + val additionalWords = Array("python", "scala") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") - .setStopWords(stopWords) + .setAdditionalWords(additionalWords) val dataSet = sqlContext.createDataFrame(Seq( (Seq("python", "scala", "a"), Seq()), (Seq("Python", "Scala", "swift"), Seq("swift")) @@ -81,6 +95,19 @@ class StopWordsRemoverSuite testStopWordsRemover(remover, dataSet) } + test("StopWordsRemover with language selection") { + val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") + .setLanguage("turkish") + val dataSet = sqlContext.createDataFrame(Seq( + (Seq("acaba", "ama", "biri"), Seq()), + (Seq("hep", "her", "scala"), Seq("scala")) + )).toDF("raw", "expected") + + testStopWordsRemover(remover, dataSet) + } + test("read/write") { val t = new StopWordsRemover() .setInputCol("myInputCol") From 43e5cf54d4f9583f8b90291b3c7603ac4e7fab2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 01:41:47 +0200 Subject: [PATCH 04/22] adjust resource files --- .../spark/ml/feature/stopwords/danish.txt | 94 ++++++ .../spark/ml/feature/stopwords/dutch.txt | 101 ++++++ .../spark/ml/feature/stopwords/english.txt | 319 ++++++++++++++++++ .../spark/ml/feature/stopwords/finnish.txt | 235 +++++++++++++ .../spark/ml/feature/stopwords/french.txt | 155 +++++++++ .../spark/ml/feature/stopwords/german.txt | 231 +++++++++++++ .../spark/ml/feature/stopwords/hungarian.txt | 199 +++++++++++ .../spark/ml/feature/stopwords/italian.txt | 279 +++++++++++++++ .../spark/ml/feature/stopwords/norwegian.txt | 176 ++++++++++ .../spark/ml/feature/stopwords/portuguese.txt | 203 +++++++++++ .../spark/ml/feature/stopwords/russian.txt | 151 +++++++++ .../spark/ml/feature/stopwords/spanish.txt | 313 +++++++++++++++++ .../spark/ml/feature/stopwords/swedish.txt | 114 +++++++ .../spark/ml/feature/stopwords/turkish.txt | 53 +++ 14 files changed, 2623 insertions(+) create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt new file mode 100644 index 000000000000..d3edc6757912 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt @@ -0,0 +1,94 @@ +og +i +jeg +det +at +en +den +til +er +som +på +de +med +han +af +for +ikke +der +var +mig +sig +men +et +har +om +vi +min +havde +ham +hun +nu +over +da +fra +du +ud +sin +dem +os +op +man +hans +hvor +eller +hvad +skal +selv +her +alle +vil +blev +kunne +ind +når +være +dog +noget +ville +jo +deres +efter +ned +skulle +denne +end +dette +mit +også +under +have +dig +anden +hende +mine +alt +meget +sit +sine +vor +mod +disse +hvis +din +nogle +hos +blive +mange +ad +bliver +hendes +været +thi +jer +sådan diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt new file mode 100644 index 000000000000..cafa0324b537 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt @@ -0,0 +1,101 @@ +de +en +van +ik +te +dat +die +in +een +hij +het +niet +zijn +is +was +op +aan +met +als +voor +had +er +maar +om +hem +dan +zou +of +wat +mijn +men +dit +zo +door +over +ze +zich +bij +ook +tot +je +mij +uit +der +daar +haar +naar +heb +hoe +heeft +hebben +deze +u +want +nog +zal +me +zij +nu +ge +geen +omdat +iets +worden +toch +al +waren +veel +meer +doen +toen +moet +ben +zonder +kan +hun +dus +alles +onder +ja +eens +hier +wie +werd +altijd +doch +wordt +wezen +kunnen +ons +zelf +tegen +na +reeds +wil +kon +niets +uw +iemand +geweest +andere diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt new file mode 100644 index 000000000000..61e5350dcde3 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt @@ -0,0 +1,319 @@ +a +about +above +across +after +afterwards +again +against +all +almost +alone +along +already +also +although +always +am +among +amongst +amoungst +amount +an +and +another +any +anyhow +anyone +anything +anyway +anywhere +are +around +as +at +back +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +below +beside +besides +between +beyond +bill +both +bottom +but +by +call +can +cannot +cant +co +computer +con +could +couldnt +cry +de +describe +detail +do +done +down +due +during +each +eg +eight +either +eleven +else +elsewhere +empty +enough +etc +even +ever +every +everyone +everything +everywhere +except +few +fifteen +fify +fill +find +fire +first +five +for +former +formerly +forty +found +four +from +front +full +further +get +give +go +had +has +hasnt +have +he +hence +her +here +hereafter +hereby +herein +hereupon +hers +herself +him +himself +his +how +however +hundred +i +ie +if +in +inc +indeed +interest +into +is +it +its +itself +keep +last +latter +latterly +least +less +ltd +made +many +may +me +meanwhile +might +mill +mine +more +moreover +most +mostly +move +much +must +my +myself +name +namely +neither +never +nevertheless +next +nine +no +nobody +none +noone +nor +not +nothing +now +nowhere +of +off +often +on +once +one +only +onto +or +other +others +otherwise +our +ours +ourselves +out +over +own +part +per +perhaps +please +put +rather +re +same +see +seem +seemed +seeming +seems +serious +several +she +should +show +side +since +sincere +six +sixty +so +some +somehow +someone +something +sometime +sometimes +somewhere +still +such +system +take +ten +than +that +the +their +them +themselves +then +thence +there +thereafter +thereby +therefore +therein +thereupon +these +they +thick +thin +third +this +those +though +three +through +throughout +thru +thus +to +together +too +top +toward +towards +twelve +twenty +two +un +under +until +up +upon +us +very +via +was +we +well +were +what +whatever +when +whence +whenever +where +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +while +whither +who +whoever +whole +whom +whose +why +will +with +within +without +would +yet +you +your +yours +yourself +yourselves diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt new file mode 100644 index 000000000000..47ee200f6781 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt @@ -0,0 +1,235 @@ +olla +olen +olet +on +olemme +olette +ovat +ole +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet +en +et +ei +emme +ette +eivät +minä +minun +minut +minua +minussa +minusta +minuun +minulla +minulta +minulle +sinä +sinun +sinut +sinua +sinussa +sinusta +sinuun +sinulla +sinulta +sinulle +hän +hänen +hänet +häntä +hänessä +hänestä +häneen +hänellä +häneltä +hänelle +me +meidän +meidät +meitä +meissä +meistä +meihin +meillä +meiltä +meille +te +teidän +teidät +teitä +teissä +teistä +teihin +teillä +teiltä +teille +he +heidän +heidät +heitä +heissä +heistä +heihin +heillä +heiltä +heille +tämä +tämän +tätä +tässä +tästä +tähän +tallä +tältä +tälle +tänä +täksi +tuo +tuon +tuotä +tuossa +tuosta +tuohon +tuolla +tuolta +tuolle +tuona +tuoksi +se +sen +sitä +siinä +siitä +siihen +sillä +siltä +sille +sinä +siksi +nämä +näiden +näitä +näissä +näistä +näihin +näillä +näiltä +näille +näinä +näiksi +nuo +noiden +noita +noissa +noista +noihin +noilla +noilta +noille +noina +noiksi +ne +niiden +niitä +niissä +niistä +niihin +niillä +niiltä +niille +niinä +niiksi +kuka +kenen +kenet +ketä +kenessä +kenestä +keneen +kenellä +keneltä +kenelle +kenenä +keneksi +ketkä +keiden +ketkä +keitä +keissä +keistä +keihin +keillä +keiltä +keille +keinä +keiksi +mikä +minkä +minkä +mitä +missä +mistä +mihin +millä +miltä +mille +minä +miksi +mitkä +joka +jonka +jota +jossa +josta +johon +jolla +jolta +jolle +jona +joksi +jotka +joiden +joita +joissa +joista +joihin +joilla +joilta +joille +joina +joiksi +että +ja +jos +koska +kuin +mutta +niin +sekä +sillä +tai +vaan +vai +vaikka +kanssa +mukaan +noin +poikki +yli +kun +niin +nyt +itse diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt new file mode 100644 index 000000000000..e7cbf4c97500 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt @@ -0,0 +1,155 @@ +au +aux +avec +ce +ces +dans +de +des +du +elle +en +et +eux +il +je +la +le +leur +lui +ma +mais +me +même +mes +moi +mon +ne +nos +notre +nous +on +ou +par +pas +pour +qu +que +qui +sa +se +ses +son +sur +ta +te +tes +toi +ton +tu +un +une +vos +votre +vous +c +d +j +l +à +m +n +s +t +y +été +étée +étées +étés +étant +étante +étants +étantes +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent +ayant +ayante +ayantes +ayants +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt new file mode 100644 index 000000000000..edef220b7a7d --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt @@ -0,0 +1,231 @@ +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +der +den +des +dem +die +das +daß +derselbe +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe +dazu +dein +deine +deinem +deinen +deiner +deines +denn +derer +dessen +dich +dir +du +dies +diese +diesem +diesen +dieser +dieses +doch +dort +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +ihn +ihm +es +etwas +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +mich +mir +ihr +ihre +ihrem +ihren +ihrer +ihres +euch +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +ihnen +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +über +um +und +uns +unse +unsem +unsen +unser +unses +unter +viel +vom +von +vor +während +war +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +würde +würden +zu +zum +zur +zwar +zwischen diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt new file mode 100644 index 000000000000..94e9f9a0b07a --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt @@ -0,0 +1,199 @@ +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elõ +elõször +elõtt +elsõ +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +õ +õk +õket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt new file mode 100644 index 000000000000..6ee02b51fb17 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt @@ -0,0 +1,279 @@ +ad +al +allo +ai +agli +all +agl +alla +alle +con +col +coi +da +dal +dallo +dai +dagli +dall +dagl +dalla +dalle +di +del +dello +dei +degli +dell +degl +della +delle +in +nel +nello +nei +negli +nell +negl +nella +nelle +su +sul +sullo +sui +sugli +sull +sugl +sulla +sulle +per +tra +contro +io +tu +lui +lei +noi +voi +loro +mio +mia +miei +mie +tuo +tua +tuoi +tue +suo +sua +suoi +sue +nostro +nostra +nostri +nostre +vostro +vostra +vostri +vostre +mi +ti +ci +vi +lo +la +li +le +gli +ne +il +un +uno +una +ma +ed +se +perché +anche +come +dov +dove +che +chi +cui +non +più +quale +quanto +quanti +quanta +quante +quello +quelli +quella +quelle +questo +questi +questa +queste +si +tutto +tutti +a +c +e +i +l +o +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt new file mode 100644 index 000000000000..9ac1abbb6cba --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt @@ -0,0 +1,176 @@ +og +i +jeg +det +at +en +et +den +til +er +som +på +de +med +han +av +ikke +ikkje +der +så +var +meg +seg +men +ett +har +om +vi +min +mitt +ha +hadde +hun +nå +over +da +ved +fra +du +ut +sin +dem +oss +opp +man +kan +hans +hvor +eller +hva +skal +selv +sjøl +her +alle +vil +bli +ble +blei +blitt +kunne +inn +når +være +kom +noen +noe +ville +dere +som +deres +kun +ja +etter +ned +skulle +denne +for +deg +si +sine +sitt +mot +å +meget +hvorfor +dette +disse +uten +hvordan +ingen +din +ditt +blir +samme +hvilken +hvilke +sånn +inni +mellom +vår +hver +hvem +vors +hvis +både +bare +enn +fordi +før +mange +også +slik +vært +være +båe +begge +siden +dykk +dykkar +dei +deira +deires +deim +di +då +eg +ein +eit +eitt +elles +honom +hjå +ho +hoe +henne +hennar +hennes +hoss +hossen +ikkje +ingi +inkje +korleis +korso +kva +kvar +kvarhelst +kven +kvi +kvifor +me +medan +mi +mine +mykje +no +nokon +noka +nokor +noko +nokre +si +sia +sidan +so +somt +somme +um +upp +vere +vore +verte +vort +varte +vart diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt new file mode 100644 index 000000000000..6b2477863b7b --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt @@ -0,0 +1,203 @@ +de +a +o +que +e +do +da +em +um +para +com +não +uma +os +no +se +na +por +mais +as +dos +como +mas +ao +ele +das +à +seu +sua +ou +quando +muito +nos +já +eu +também +só +pelo +pela +até +isso +ela +entre +depois +sem +mesmo +aos +seus +quem +nas +me +esse +eles +você +essa +num +nem +suas +meu +às +minha +numa +pelos +elas +qual +nós +lhe +deles +essas +esses +pelas +este +dele +tu +te +vocês +vos +lhes +meus +minhas +teu +tua +teus +tuas +nosso +nossa +nossos +nossas +dela +delas +esta +estes +estas +aquele +aquela +aqueles +aquelas +isto +aquilo +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt new file mode 100644 index 000000000000..ecb83d4a7f39 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt @@ -0,0 +1,151 @@ +и +в +во +не +что +он +на +я +с +со +как +а +то +все +она +так +его +но +да +ты +к +у +же +вы +за +бы +по +только +ее +мне +было +вот +от +меня +еще +нет +о +из +ему +теперь +когда +даже +ну +вдруг +ли +если +уже +или +ни +быть +был +него +до +вас +нибудь +опять +уж +вам +ведь +там +потом +себя +ничего +ей +может +они +тут +где +есть +надо +ней +для +мы +тебя +их +чем +была +сам +чтоб +без +будто +чего +раз +тоже +себе +под +будет +ж +тогда +кто +этот +того +потому +этого +какой +совсем +ним +здесь +этом +один +почти +мой +тем +чтобы +нее +сейчас +были +куда +зачем +всех +никогда +можно +при +наконец +два +об +другой +хоть +после +над +больше +тот +через +эти +нас +про +всего +них +какая +много +разве +три +эту +моя +впрочем +хорошо +свою +этой +перед +иногда +лучше +чуть +том +нельзя +такой +им +более +всегда +конечно +всю +между diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt new file mode 100644 index 000000000000..59bc786caa49 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt @@ -0,0 +1,313 @@ +de +la +que +el +en +y +a +los +del +se +las +por +un +para +con +no +una +su +al +lo +como +más +pero +sus +le +ya +o +este +sí +porque +esta +entre +cuando +muy +sin +sobre +también +me +hasta +hay +donde +quien +desde +todo +nos +durante +todos +uno +les +ni +contra +otros +ese +eso +ante +ellos +e +esto +mí +antes +algunos +qué +unos +yo +otro +otras +otra +él +tanto +esa +estos +mucho +quienes +nada +muchos +cual +poco +ella +estar +estas +algunas +algo +nosotros +mi +mis +tú +te +ti +tu +tus +ellas +nosotras +vosostros +vosostras +os +mío +mía +míos +mías +tuyo +tuya +tuyos +tuyas +suyo +suya +suyos +suyas +nuestro +nuestra +nuestros +nuestras +vuestro +vuestra +vuestros +vuestras +esos +esas +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería +serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +sintiendo +sentido +sentida +sentidos +sentidas +siente +sentid +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt new file mode 100644 index 000000000000..742bb6263b99 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt @@ -0,0 +1,114 @@ +och +det +att +i +en +jag +hon +som +han +på +den +med +var +sig +för +så +till +är +men +ett +om +hade +de +av +icke +mig +du +henne +då +sin +nu +har +inte +hans +honom +skulle +hennes +där +min +man +ej +vid +kunde +något +från +ut +när +efter +upp +vi +dem +vara +vad +över +än +dig +kan +sina +här +ha +mot +alla +under +någon +eller +allt +mycket +sedan +ju +denna +själv +detta +åt +utan +varit +hur +ingen +mitt +ni +bli +blev +oss +din +dessa +några +deras +blir +mina +samma +vilken +er +sådan +vår +blivit +dess +inom +mellan +sådant +varför +varje +vilka +ditt +vem +vilket +sitta +sådana +vart +dina +vars +vårt +våra +ert +era +vilkas diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt new file mode 100644 index 000000000000..5a48ccce0737 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt @@ -0,0 +1,53 @@ +acaba +ama +aslında +az +bazı +belki +biri +birkaç +birşey +biz +bu +çok +çünkü +da +daha +de +defa +diye +eğer +en +gibi +hem +hep +hepsi +her +hiç +için +ile +ise +kez +ki +kim +mı +mu +mü +nasıl +ne +neden +nerde +nerede +nereye +niçin +niye +o +sanki +şey +siz +şu +tüm +ve +veya +ya +yani From a43039223a28b308ae1c14d33be5e5a1df382ed6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 01:43:15 +0200 Subject: [PATCH 05/22] adjust resource files --- .../danish.txt | 94 ------ .../dutch.txt | 101 ------ .../english.txt | 319 ------------------ .../finnish.txt | 235 ------------- .../french.txt | 155 --------- .../german.txt | 231 ------------- .../hungarian.txt | 199 ----------- .../italian.txt | 279 --------------- .../norwegian.txt | 176 ---------- .../portuguese.txt | 203 ----------- .../russian.txt | 151 --------- .../spanish.txt | 313 ----------------- .../swedish.txt | 114 ------- .../turkish.txt | 53 --- 14 files changed, 2623 deletions(-) delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt deleted file mode 100644 index d3edc6757912..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt +++ /dev/null @@ -1,94 +0,0 @@ -og -i -jeg -det -at -en -den -til -er -som -på -de -med -han -af -for -ikke -der -var -mig -sig -men -et -har -om -vi -min -havde -ham -hun -nu -over -da -fra -du -ud -sin -dem -os -op -man -hans -hvor -eller -hvad -skal -selv -her -alle -vil -blev -kunne -ind -når -være -dog -noget -ville -jo -deres -efter -ned -skulle -denne -end -dette -mit -også -under -have -dig -anden -hende -mine -alt -meget -sit -sine -vor -mod -disse -hvis -din -nogle -hos -blive -mange -ad -bliver -hendes -været -thi -jer -sådan diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt deleted file mode 100644 index cafa0324b537..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt +++ /dev/null @@ -1,101 +0,0 @@ -de -en -van -ik -te -dat -die -in -een -hij -het -niet -zijn -is -was -op -aan -met -als -voor -had -er -maar -om -hem -dan -zou -of -wat -mijn -men -dit -zo -door -over -ze -zich -bij -ook -tot -je -mij -uit -der -daar -haar -naar -heb -hoe -heeft -hebben -deze -u -want -nog -zal -me -zij -nu -ge -geen -omdat -iets -worden -toch -al -waren -veel -meer -doen -toen -moet -ben -zonder -kan -hun -dus -alles -onder -ja -eens -hier -wie -werd -altijd -doch -wordt -wezen -kunnen -ons -zelf -tegen -na -reeds -wil -kon -niets -uw -iemand -geweest -andere diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt deleted file mode 100644 index 61e5350dcde3..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt +++ /dev/null @@ -1,319 +0,0 @@ -a -about -above -across -after -afterwards -again -against -all -almost -alone -along -already -also -although -always -am -among -amongst -amoungst -amount -an -and -another -any -anyhow -anyone -anything -anyway -anywhere -are -around -as -at -back -be -became -because -become -becomes -becoming -been -before -beforehand -behind -being -below -beside -besides -between -beyond -bill -both -bottom -but -by -call -can -cannot -cant -co -computer -con -could -couldnt -cry -de -describe -detail -do -done -down -due -during -each -eg -eight -either -eleven -else -elsewhere -empty -enough -etc -even -ever -every -everyone -everything -everywhere -except -few -fifteen -fify -fill -find -fire -first -five -for -former -formerly -forty -found -four -from -front -full -further -get -give -go -had -has -hasnt -have -he -hence -her -here -hereafter -hereby -herein -hereupon -hers -herself -him -himself -his -how -however -hundred -i -ie -if -in -inc -indeed -interest -into -is -it -its -itself -keep -last -latter -latterly -least -less -ltd -made -many -may -me -meanwhile -might -mill -mine -more -moreover -most -mostly -move -much -must -my -myself -name -namely -neither -never -nevertheless -next -nine -no -nobody -none -noone -nor -not -nothing -now -nowhere -of -off -often -on -once -one -only -onto -or -other -others -otherwise -our -ours -ourselves -out -over -own -part -per -perhaps -please -put -rather -re -same -see -seem -seemed -seeming -seems -serious -several -she -should -show -side -since -sincere -six -sixty -so -some -somehow -someone -something -sometime -sometimes -somewhere -still -such -system -take -ten -than -that -the -their -them -themselves -then -thence -there -thereafter -thereby -therefore -therein -thereupon -these -they -thick -thin -third -this -those -though -three -through -throughout -thru -thus -to -together -too -top -toward -towards -twelve -twenty -two -un -under -until -up -upon -us -very -via -was -we -well -were -what -whatever -when -whence -whenever -where -whereafter -whereas -whereby -wherein -whereupon -wherever -whether -which -while -whither -who -whoever -whole -whom -whose -why -will -with -within -without -would -yet -you -your -yours -yourself -yourselves diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt deleted file mode 100644 index 47ee200f6781..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt +++ /dev/null @@ -1,235 +0,0 @@ -olla -olen -olet -on -olemme -olette -ovat -ole -oli -olisi -olisit -olisin -olisimme -olisitte -olisivat -olit -olin -olimme -olitte -olivat -ollut -olleet -en -et -ei -emme -ette -eivät -minä -minun -minut -minua -minussa -minusta -minuun -minulla -minulta -minulle -sinä -sinun -sinut -sinua -sinussa -sinusta -sinuun -sinulla -sinulta -sinulle -hän -hänen -hänet -häntä -hänessä -hänestä -häneen -hänellä -häneltä -hänelle -me -meidän -meidät -meitä -meissä -meistä -meihin -meillä -meiltä -meille -te -teidän -teidät -teitä -teissä -teistä -teihin -teillä -teiltä -teille -he -heidän -heidät -heitä -heissä -heistä -heihin -heillä -heiltä -heille -tämä -tämän -tätä -tässä -tästä -tähän -tallä -tältä -tälle -tänä -täksi -tuo -tuon -tuotä -tuossa -tuosta -tuohon -tuolla -tuolta -tuolle -tuona -tuoksi -se -sen -sitä -siinä -siitä -siihen -sillä -siltä -sille -sinä -siksi -nämä -näiden -näitä -näissä -näistä -näihin -näillä -näiltä -näille -näinä -näiksi -nuo -noiden -noita -noissa -noista -noihin -noilla -noilta -noille -noina -noiksi -ne -niiden -niitä -niissä -niistä -niihin -niillä -niiltä -niille -niinä -niiksi -kuka -kenen -kenet -ketä -kenessä -kenestä -keneen -kenellä -keneltä -kenelle -kenenä -keneksi -ketkä -keiden -ketkä -keitä -keissä -keistä -keihin -keillä -keiltä -keille -keinä -keiksi -mikä -minkä -minkä -mitä -missä -mistä -mihin -millä -miltä -mille -minä -miksi -mitkä -joka -jonka -jota -jossa -josta -johon -jolla -jolta -jolle -jona -joksi -jotka -joiden -joita -joissa -joista -joihin -joilla -joilta -joille -joina -joiksi -että -ja -jos -koska -kuin -mutta -niin -sekä -sillä -tai -vaan -vai -vaikka -kanssa -mukaan -noin -poikki -yli -kun -niin -nyt -itse diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt deleted file mode 100644 index e7cbf4c97500..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt +++ /dev/null @@ -1,155 +0,0 @@ -au -aux -avec -ce -ces -dans -de -des -du -elle -en -et -eux -il -je -la -le -leur -lui -ma -mais -me -même -mes -moi -mon -ne -nos -notre -nous -on -ou -par -pas -pour -qu -que -qui -sa -se -ses -son -sur -ta -te -tes -toi -ton -tu -un -une -vos -votre -vous -c -d -j -l -à -m -n -s -t -y -été -étée -étées -étés -étant -étante -étants -étantes -suis -es -est -sommes -êtes -sont -serai -seras -sera -serons -serez -seront -serais -serait -serions -seriez -seraient -étais -était -étions -étiez -étaient -fus -fut -fûmes -fûtes -furent -sois -soit -soyons -soyez -soient -fusse -fusses -fût -fussions -fussiez -fussent -ayant -ayante -ayantes -ayants -eu -eue -eues -eus -ai -as -avons -avez -ont -aurai -auras -aura -aurons -aurez -auront -aurais -aurait -aurions -auriez -auraient -avais -avait -avions -aviez -avaient -eut -eûmes -eûtes -eurent -aie -aies -ait -ayons -ayez -aient -eusse -eusses -eût -eussions -eussiez -eussent diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt deleted file mode 100644 index edef220b7a7d..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt +++ /dev/null @@ -1,231 +0,0 @@ -aber -alle -allem -allen -aller -alles -als -also -am -an -ander -andere -anderem -anderen -anderer -anderes -anderm -andern -anderr -anders -auch -auf -aus -bei -bin -bis -bist -da -damit -dann -der -den -des -dem -die -das -daß -derselbe -derselben -denselben -desselben -demselben -dieselbe -dieselben -dasselbe -dazu -dein -deine -deinem -deinen -deiner -deines -denn -derer -dessen -dich -dir -du -dies -diese -diesem -diesen -dieser -dieses -doch -dort -durch -ein -eine -einem -einen -einer -eines -einig -einige -einigem -einigen -einiger -einiges -einmal -er -ihn -ihm -es -etwas -euer -eure -eurem -euren -eurer -eures -für -gegen -gewesen -hab -habe -haben -hat -hatte -hatten -hier -hin -hinter -ich -mich -mir -ihr -ihre -ihrem -ihren -ihrer -ihres -euch -im -in -indem -ins -ist -jede -jedem -jeden -jeder -jedes -jene -jenem -jenen -jener -jenes -jetzt -kann -kein -keine -keinem -keinen -keiner -keines -können -könnte -machen -man -manche -manchem -manchen -mancher -manches -mein -meine -meinem -meinen -meiner -meines -mit -muss -musste -nach -nicht -nichts -noch -nun -nur -ob -oder -ohne -sehr -sein -seine -seinem -seinen -seiner -seines -selbst -sich -sie -ihnen -sind -so -solche -solchem -solchen -solcher -solches -soll -sollte -sondern -sonst -über -um -und -uns -unse -unsem -unsen -unser -unses -unter -viel -vom -von -vor -während -war -waren -warst -was -weg -weil -weiter -welche -welchem -welchen -welcher -welches -wenn -werde -werden -wie -wieder -will -wir -wird -wirst -wo -wollen -wollte -würde -würden -zu -zum -zur -zwar -zwischen diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt deleted file mode 100644 index 94e9f9a0b07a..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt +++ /dev/null @@ -1,199 +0,0 @@ -a -ahogy -ahol -aki -akik -akkor -alatt -által -általában -amely -amelyek -amelyekben -amelyeket -amelyet -amelynek -ami -amit -amolyan -amíg -amikor -át -abban -ahhoz -annak -arra -arról -az -azok -azon -azt -azzal -azért -aztán -azután -azonban -bár -be -belül -benne -cikk -cikkek -cikkeket -csak -de -e -eddig -egész -egy -egyes -egyetlen -egyéb -egyik -egyre -ekkor -el -elég -ellen -elõ -elõször -elõtt -elsõ -én -éppen -ebben -ehhez -emilyen -ennek -erre -ez -ezt -ezek -ezen -ezzel -ezért -és -fel -felé -hanem -hiszen -hogy -hogyan -igen -így -illetve -ill. -ill -ilyen -ilyenkor -ison -ismét -itt -jó -jól -jobban -kell -kellett -keresztül -keressünk -ki -kívül -között -közül -legalább -lehet -lehetett -legyen -lenne -lenni -lesz -lett -maga -magát -majd -majd -már -más -másik -meg -még -mellett -mert -mely -melyek -mi -mit -míg -miért -milyen -mikor -minden -mindent -mindenki -mindig -mint -mintha -mivel -most -nagy -nagyobb -nagyon -ne -néha -nekem -neki -nem -néhány -nélkül -nincs -olyan -ott -össze -õ -õk -õket -pedig -persze -rá -s -saját -sem -semmi -sok -sokat -sokkal -számára -szemben -szerint -szinte -talán -tehát -teljes -tovább -továbbá -több -úgy -ugyanis -új -újabb -újra -után -utána -utolsó -vagy -vagyis -valaki -valami -valamint -való -vagyok -van -vannak -volt -voltam -voltak -voltunk -vissza -vele -viszont -volna diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt deleted file mode 100644 index 6ee02b51fb17..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt +++ /dev/null @@ -1,279 +0,0 @@ -ad -al -allo -ai -agli -all -agl -alla -alle -con -col -coi -da -dal -dallo -dai -dagli -dall -dagl -dalla -dalle -di -del -dello -dei -degli -dell -degl -della -delle -in -nel -nello -nei -negli -nell -negl -nella -nelle -su -sul -sullo -sui -sugli -sull -sugl -sulla -sulle -per -tra -contro -io -tu -lui -lei -noi -voi -loro -mio -mia -miei -mie -tuo -tua -tuoi -tue -suo -sua -suoi -sue -nostro -nostra -nostri -nostre -vostro -vostra -vostri -vostre -mi -ti -ci -vi -lo -la -li -le -gli -ne -il -un -uno -una -ma -ed -se -perché -anche -come -dov -dove -che -chi -cui -non -più -quale -quanto -quanti -quanta -quante -quello -quelli -quella -quelle -questo -questi -questa -queste -si -tutto -tutti -a -c -e -i -l -o -ho -hai -ha -abbiamo -avete -hanno -abbia -abbiate -abbiano -avrò -avrai -avrà -avremo -avrete -avranno -avrei -avresti -avrebbe -avremmo -avreste -avrebbero -avevo -avevi -aveva -avevamo -avevate -avevano -ebbi -avesti -ebbe -avemmo -aveste -ebbero -avessi -avesse -avessimo -avessero -avendo -avuto -avuta -avuti -avute -sono -sei -è -siamo -siete -sia -siate -siano -sarò -sarai -sarà -saremo -sarete -saranno -sarei -saresti -sarebbe -saremmo -sareste -sarebbero -ero -eri -era -eravamo -eravate -erano -fui -fosti -fu -fummo -foste -furono -fossi -fosse -fossimo -fossero -essendo -faccio -fai -facciamo -fanno -faccia -facciate -facciano -farò -farai -farà -faremo -farete -faranno -farei -faresti -farebbe -faremmo -fareste -farebbero -facevo -facevi -faceva -facevamo -facevate -facevano -feci -facesti -fece -facemmo -faceste -fecero -facessi -facesse -facessimo -facessero -facendo -sto -stai -sta -stiamo -stanno -stia -stiate -stiano -starò -starai -starà -staremo -starete -staranno -starei -staresti -starebbe -staremmo -stareste -starebbero -stavo -stavi -stava -stavamo -stavate -stavano -stetti -stesti -stette -stemmo -steste -stettero -stessi -stesse -stessimo -stessero -stando diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt deleted file mode 100644 index 9ac1abbb6cba..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt +++ /dev/null @@ -1,176 +0,0 @@ -og -i -jeg -det -at -en -et -den -til -er -som -på -de -med -han -av -ikke -ikkje -der -så -var -meg -seg -men -ett -har -om -vi -min -mitt -ha -hadde -hun -nå -over -da -ved -fra -du -ut -sin -dem -oss -opp -man -kan -hans -hvor -eller -hva -skal -selv -sjøl -her -alle -vil -bli -ble -blei -blitt -kunne -inn -når -være -kom -noen -noe -ville -dere -som -deres -kun -ja -etter -ned -skulle -denne -for -deg -si -sine -sitt -mot -å -meget -hvorfor -dette -disse -uten -hvordan -ingen -din -ditt -blir -samme -hvilken -hvilke -sånn -inni -mellom -vår -hver -hvem -vors -hvis -både -bare -enn -fordi -før -mange -også -slik -vært -være -båe -begge -siden -dykk -dykkar -dei -deira -deires -deim -di -då -eg -ein -eit -eitt -elles -honom -hjå -ho -hoe -henne -hennar -hennes -hoss -hossen -ikkje -ingi -inkje -korleis -korso -kva -kvar -kvarhelst -kven -kvi -kvifor -me -medan -mi -mine -mykje -no -nokon -noka -nokor -noko -nokre -si -sia -sidan -so -somt -somme -um -upp -vere -vore -verte -vort -varte -vart diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt deleted file mode 100644 index 6b2477863b7b..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt +++ /dev/null @@ -1,203 +0,0 @@ -de -a -o -que -e -do -da -em -um -para -com -não -uma -os -no -se -na -por -mais -as -dos -como -mas -ao -ele -das -à -seu -sua -ou -quando -muito -nos -já -eu -também -só -pelo -pela -até -isso -ela -entre -depois -sem -mesmo -aos -seus -quem -nas -me -esse -eles -você -essa -num -nem -suas -meu -às -minha -numa -pelos -elas -qual -nós -lhe -deles -essas -esses -pelas -este -dele -tu -te -vocês -vos -lhes -meus -minhas -teu -tua -teus -tuas -nosso -nossa -nossos -nossas -dela -delas -esta -estes -estas -aquele -aquela -aqueles -aquelas -isto -aquilo -estou -está -estamos -estão -estive -esteve -estivemos -estiveram -estava -estávamos -estavam -estivera -estivéramos -esteja -estejamos -estejam -estivesse -estivéssemos -estivessem -estiver -estivermos -estiverem -hei -há -havemos -hão -houve -houvemos -houveram -houvera -houvéramos -haja -hajamos -hajam -houvesse -houvéssemos -houvessem -houver -houvermos -houverem -houverei -houverá -houveremos -houverão -houveria -houveríamos -houveriam -sou -somos -são -era -éramos -eram -fui -foi -fomos -foram -fora -fôramos -seja -sejamos -sejam -fosse -fôssemos -fossem -for -formos -forem -serei -será -seremos -serão -seria -seríamos -seriam -tenho -tem -temos -tém -tinha -tínhamos -tinham -tive -teve -tivemos -tiveram -tivera -tivéramos -tenha -tenhamos -tenham -tivesse -tivéssemos -tivessem -tiver -tivermos -tiverem -terei -terá -teremos -terão -teria -teríamos -teriam diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt deleted file mode 100644 index ecb83d4a7f39..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt +++ /dev/null @@ -1,151 +0,0 @@ -и -в -во -не -что -он -на -я -с -со -как -а -то -все -она -так -его -но -да -ты -к -у -же -вы -за -бы -по -только -ее -мне -было -вот -от -меня -еще -нет -о -из -ему -теперь -когда -даже -ну -вдруг -ли -если -уже -или -ни -быть -был -него -до -вас -нибудь -опять -уж -вам -ведь -там -потом -себя -ничего -ей -может -они -тут -где -есть -надо -ней -для -мы -тебя -их -чем -была -сам -чтоб -без -будто -чего -раз -тоже -себе -под -будет -ж -тогда -кто -этот -того -потому -этого -какой -совсем -ним -здесь -этом -один -почти -мой -тем -чтобы -нее -сейчас -были -куда -зачем -всех -никогда -можно -при -наконец -два -об -другой -хоть -после -над -больше -тот -через -эти -нас -про -всего -них -какая -много -разве -три -эту -моя -впрочем -хорошо -свою -этой -перед -иногда -лучше -чуть -том -нельзя -такой -им -более -всегда -конечно -всю -между diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt deleted file mode 100644 index 59bc786caa49..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt +++ /dev/null @@ -1,313 +0,0 @@ -de -la -que -el -en -y -a -los -del -se -las -por -un -para -con -no -una -su -al -lo -como -más -pero -sus -le -ya -o -este -sí -porque -esta -entre -cuando -muy -sin -sobre -también -me -hasta -hay -donde -quien -desde -todo -nos -durante -todos -uno -les -ni -contra -otros -ese -eso -ante -ellos -e -esto -mí -antes -algunos -qué -unos -yo -otro -otras -otra -él -tanto -esa -estos -mucho -quienes -nada -muchos -cual -poco -ella -estar -estas -algunas -algo -nosotros -mi -mis -tú -te -ti -tu -tus -ellas -nosotras -vosostros -vosostras -os -mío -mía -míos -mías -tuyo -tuya -tuyos -tuyas -suyo -suya -suyos -suyas -nuestro -nuestra -nuestros -nuestras -vuestro -vuestra -vuestros -vuestras -esos -esas -estoy -estás -está -estamos -estáis -están -esté -estés -estemos -estéis -estén -estaré -estarás -estará -estaremos -estaréis -estarán -estaría -estarías -estaríamos -estaríais -estarían -estaba -estabas -estábamos -estabais -estaban -estuve -estuviste -estuvo -estuvimos -estuvisteis -estuvieron -estuviera -estuvieras -estuviéramos -estuvierais -estuvieran -estuviese -estuvieses -estuviésemos -estuvieseis -estuviesen -estando -estado -estada -estados -estadas -estad -he -has -ha -hemos -habéis -han -haya -hayas -hayamos -hayáis -hayan -habré -habrás -habrá -habremos -habréis -habrán -habría -habrías -habríamos -habríais -habrían -había -habías -habíamos -habíais -habían -hube -hubiste -hubo -hubimos -hubisteis -hubieron -hubiera -hubieras -hubiéramos -hubierais -hubieran -hubiese -hubieses -hubiésemos -hubieseis -hubiesen -habiendo -habido -habida -habidos -habidas -soy -eres -es -somos -sois -son -sea -seas -seamos -seáis -sean -seré -serás -será -seremos -seréis -serán -sería -serías -seríamos -seríais -serían -era -eras -éramos -erais -eran -fui -fuiste -fue -fuimos -fuisteis -fueron -fuera -fueras -fuéramos -fuerais -fueran -fuese -fueses -fuésemos -fueseis -fuesen -sintiendo -sentido -sentida -sentidos -sentidas -siente -sentid -tengo -tienes -tiene -tenemos -tenéis -tienen -tenga -tengas -tengamos -tengáis -tengan -tendré -tendrás -tendrá -tendremos -tendréis -tendrán -tendría -tendrías -tendríamos -tendríais -tendrían -tenía -tenías -teníamos -teníais -tenían -tuve -tuviste -tuvo -tuvimos -tuvisteis -tuvieron -tuviera -tuvieras -tuviéramos -tuvierais -tuvieran -tuviese -tuvieses -tuviésemos -tuvieseis -tuviesen -teniendo -tenido -tenida -tenidos -tenidas -tened diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt deleted file mode 100644 index 742bb6263b99..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt +++ /dev/null @@ -1,114 +0,0 @@ -och -det -att -i -en -jag -hon -som -han -på -den -med -var -sig -för -så -till -är -men -ett -om -hade -de -av -icke -mig -du -henne -då -sin -nu -har -inte -hans -honom -skulle -hennes -där -min -man -ej -vid -kunde -något -från -ut -när -efter -upp -vi -dem -vara -vad -över -än -dig -kan -sina -här -ha -mot -alla -under -någon -eller -allt -mycket -sedan -ju -denna -själv -detta -åt -utan -varit -hur -ingen -mitt -ni -bli -blev -oss -din -dessa -några -deras -blir -mina -samma -vilken -er -sådan -vår -blivit -dess -inom -mellan -sådant -varför -varje -vilka -ditt -vem -vilket -sitta -sådana -vart -dina -vars -vårt -våra -ert -era -vilkas diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt deleted file mode 100644 index 5a48ccce0737..000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt +++ /dev/null @@ -1,53 +0,0 @@ -acaba -ama -aslında -az -bazı -belki -biri -birkaç -birşey -biz -bu -çok -çünkü -da -daha -de -defa -diye -eğer -en -gibi -hem -hep -hepsi -her -hiç -için -ile -ise -kez -ki -kim -mı -mu -mü -nasıl -ne -neden -nerde -nerede -nereye -niçin -niye -o -sanki -şey -siz -şu -tüm -ve -veya -ya -yani From 28ee249f676971371d11d16c2912bbf81e045269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 01:46:42 +0200 Subject: [PATCH 06/22] fix stopwords bug --- .../spark/ml/feature/StopWordsRemover.scala | 74 +++++++++---------- 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index e7f1d8323376..56db88950cab 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -19,49 +19,40 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, StringArrayParam} +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StringType, StructType} /** - * stop words list - */ + * stop words list + */ private[spark] object StopWords { + /** Read stop words list from resources */ def readStopWords(language: String): Array[String] = { - val is = getClass.getResourceAsStream(s"/$language.txt") + val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") scala.io.Source.fromInputStream(is).getLines().toArray } - lazy val Danish = readStopWords("/danish.txt") - lazy val Dutch = readStopWords("/dutch.txt") - lazy val English = readStopWords("/english.txt") - lazy val Finnish = readStopWords("/finnish.txt") - lazy val French = readStopWords("/french.txt") - lazy val German = readStopWords("/german.txt") - lazy val Hungarian = readStopWords("/hungarian.txt") - lazy val Italian = readStopWords("/italian.txt") - lazy val Norwegian = readStopWords("/norwegian.txt") - lazy val Portuguese = readStopWords("/portuguese.txt") - lazy val Russian = readStopWords("/russian.txt") - lazy val Spanish = readStopWords("/spanish.txt") - lazy val Swedish = readStopWords("/swedish.txt") - lazy val Turkish = readStopWords("/turkish.txt") - - val languageMap = Map("danish" -> Danish, "dutch" -> Dutch, "english" -> English, - "finnish" -> Finnish, "french" -> French, "german" -> German, "hungarian" -> Hungarian, - "italian" -> Italian, "norwegian" -> Norwegian, "portuguese" -> Portuguese, - "russian" -> Russian, "spanish" -> Spanish, "swedish" -> Swedish, "turkish" -> Turkish) + /** Supported languages list must be lowercase */ + val supportedLanguages = Array("danish", "dutch", "english", "finnish", "french", "german", + "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish") + + /** Languages and stopwords map */ + val languageMap = supportedLanguages.map{ + language => language -> readStopWords(language) + }.toMap } /** - * :: Experimental :: - * A feature transformer that filters out stop words from input. - * Note: null values from input array are preserved unless adding null to stopWords explicitly. - * @see [[http://en.wikipedia.org/wiki/Stop_words]] - */ + * :: Experimental :: + * A feature transformer that filters out stop words from input. + * Note: null values from input array are preserved unless adding null to stopWords explicitly. + * @see [[http://en.wikipedia.org/wiki/Stop_words]] + */ @Experimental class StopWordsRemover(override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { @@ -75,10 +66,10 @@ class StopWordsRemover(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * the stop words set to be filtered out - * Default: [[StopWords.English]] - * @group param - */ + * the stop words set to be filtered out + * Default: [[StopWords.English]] + * @group param + */ val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") /** @group setParam */ @@ -91,10 +82,10 @@ class StopWordsRemover(override val uid: String) def getStopWords: Array[String] = $(stopWords) /** - * whether to do a case sensitive comparison over the stop words - * Default: false - * @group param - */ + * whether to do a case sensitive comparison over the stop words + * Default: false + * @group param + */ val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive", "whether to do case-sensitive comparison during filtering") @@ -150,13 +141,16 @@ class StopWordsRemover(override val uid: String) /** @group getParam */ def getAdditionalWords: Array[String] = $(additionalWords) - setDefault(stopWords -> StopWords.English, language -> "en", ignoredWords -> Array.empty[String] - , additionalWords -> Array.empty[String], caseSensitive -> false) + setDefault(stopWords -> StopWords.languageMap("english"), + language -> "en", + ignoredWords -> Array.empty[String], + additionalWords -> Array.empty[String], + caseSensitive -> false) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val t = if ($(caseSensitive)) { - val stopWordsSet = ($(stopWords) ++ $(additionalWords)).toSet - $(ignoredWords).toSet + val stopWordsSet = ($(stopWords) ++ $(additionalWords)).toSet -- $(ignoredWords).toSet udf { terms: Seq[String] => terms.filter(s => !stopWordsSet.contains(s)) } @@ -164,7 +158,7 @@ class StopWordsRemover(override val uid: String) val toLower = (s: String) => if (s != null) s.toLowerCase else s val lowerStopWords = { ($(stopWords) ++ $(additionalWords)) - .map(toLower(_)).toSet - $(ignoredWords).map(toLower(_)).toSet + .map(toLower(_)).toSet -- $(ignoredWords).map(toLower(_)).toSet } udf { terms: Seq[String] => terms.filter(s => !lowerStopWords.contains(toLower(s))) From 6d215b31a205c4a79e8cc0ef6963d239941e80ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 01:53:06 +0200 Subject: [PATCH 07/22] update comment lines --- .../scala/org/apache/spark/ml/feature/StopWordsRemover.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 56db88950cab..40674b2aaee0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -67,7 +67,7 @@ class StopWordsRemover(override val uid: String) /** * the stop words set to be filtered out - * Default: [[StopWords.English]] + * Default: [[StopWords.languageMap("english")]] * @group param */ val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") @@ -142,7 +142,7 @@ class StopWordsRemover(override val uid: String) def getAdditionalWords: Array[String] = $(additionalWords) setDefault(stopWords -> StopWords.languageMap("english"), - language -> "en", + language -> "english", ignoredWords -> Array.empty[String], additionalWords -> Array.empty[String], caseSensitive -> false) From 6deceecf88c66b3293698aca5d7306c2aa02e2e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 18:24:38 +0200 Subject: [PATCH 08/22] update stop words list --- .../spark/ml/feature/stopwords/danish.txt | 2 +- .../spark/ml/feature/stopwords/dutch.txt | 2 +- .../spark/ml/feature/stopwords/english.txt | 422 ++++++------------ .../spark/ml/feature/stopwords/finnish.txt | 2 +- .../spark/ml/feature/stopwords/french.txt | 2 +- .../spark/ml/feature/stopwords/german.txt | 2 +- .../spark/ml/feature/stopwords/hungarian.txt | 2 +- .../spark/ml/feature/stopwords/italian.txt | 2 +- .../spark/ml/feature/stopwords/norwegian.txt | 2 +- .../spark/ml/feature/stopwords/portuguese.txt | 2 +- .../spark/ml/feature/stopwords/russian.txt | 2 +- .../spark/ml/feature/stopwords/spanish.txt | 2 +- .../spark/ml/feature/stopwords/swedish.txt | 2 +- .../spark/ml/feature/stopwords/turkish.txt | 2 +- 14 files changed, 141 insertions(+), 307 deletions(-) diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt index d3edc6757912..ea9e2c4abe5b 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt @@ -91,4 +91,4 @@ hendes været thi jer -sådan +sådan \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt index cafa0324b537..023cc2c939b2 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt @@ -98,4 +98,4 @@ niets uw iemand geweest -andere +andere \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt index 61e5350dcde3..d075cc0babc3 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt @@ -1,319 +1,153 @@ -a -about -above -across -after -afterwards -again -against -all -almost -alone -along -already -also -although -always +i +me +my +myself +we +our +ours +ourselves +you +your +yours +yourself +yourselves +he +him +his +himself +she +her +hers +herself +it +its +itself +they +them +their +theirs +themselves +what +which +who +whom +this +that +these +those am -among -amongst -amoungst -amount -an -and -another -any -anyhow -anyone -anything -anyway -anywhere +is are -around -as -at -back +was +were be -became -because -become -becomes -becoming been -before -beforehand -behind being -below -beside -besides -between -beyond -bill -both -bottom +have +has +had +having +do +does +did +doing +a +an +the +and but +if +or +because +as +until +while +of +at by -call -can -cannot -cant -co -computer -con -could -couldnt -cry -de -describe -detail -do -done -down -due -during -each -eg -eight -either -eleven -else -elsewhere -empty -enough -etc -even -ever -every -everyone -everything -everywhere -except -few -fifteen -fify -fill -find -fire -first -five for -former -formerly -forty -found -four +with +about +against +between +into +through +during +before +after +above +below +to from -front -full +up +down +in +out +on +off +over +under +again further -get -give -go -had -has -hasnt -have -he -hence -her +then +once here -hereafter -hereby -herein -hereupon -hers -herself -him -himself -his +there +when +where +why how -however -hundred -i -ie -if -in -inc -indeed -interest -into -is -it -its -itself -keep -last -latter -latterly -least -less -ltd -made -many -may -me -meanwhile -might -mill -mine +all +any +both +each +few more -moreover most -mostly -move -much -must -my -myself -name -namely -neither -never -nevertheless -next -nine +other +some +such no -nobody -none -noone nor not -nothing -now -nowhere -of -off -often -on -once -one only -onto -or -other -others -otherwise -our -ours -ourselves -out -over own -part -per -perhaps -please -put -rather -re same -see -seem -seemed -seeming -seems -serious -several -she -should -show -side -since -sincere -six -sixty so -some -somehow -someone -something -sometime -sometimes -somewhere -still -such -system -take -ten than -that -the -their -them -themselves -then -thence -there -thereafter -thereby -therefore -therein -thereupon -these -they -thick -thin -third -this -those -though -three -through -throughout -thru -thus -to -together too -top -toward -towards -twelve -twenty -two -un -under -until -up -upon -us very -via -was -we -well -were -what -whatever -when -whence -whenever -where -whereafter -whereas -whereby -wherein -whereupon -wherever -whether -which -while -whither -who -whoever -whole -whom -whose -why +s +t +can will -with -within -without -would -yet -you -your -yours -yourself -yourselves +just +don +should +now +d +ll +m +o +re +ve +y +ain +aren +couldn +didn +doesn +hadn +hasn +haven +isn +ma +mightn +mustn +needn +shan +shouldn +wasn +weren +won +wouldn diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt index 47ee200f6781..5b0eb10777d0 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt @@ -232,4 +232,4 @@ yli kun niin nyt -itse +itse \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt index e7cbf4c97500..94b8f8f39a3e 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt @@ -152,4 +152,4 @@ eusses eût eussions eussiez -eussent +eussent \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt index edef220b7a7d..7e65190f8ba2 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt @@ -228,4 +228,4 @@ zu zum zur zwar -zwischen +zwischen \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt index 94e9f9a0b07a..8d4543a0965d 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt @@ -196,4 +196,4 @@ voltunk vissza vele viszont -volna +volna \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt index 6ee02b51fb17..783b2e0cbfcd 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt @@ -276,4 +276,4 @@ stessi stesse stessimo stessero -stando +stando \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt index 9ac1abbb6cba..cb91702c5e9a 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt @@ -173,4 +173,4 @@ vore verte vort varte -vart +vart \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt index 6b2477863b7b..98b4fdcdf7a2 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt @@ -200,4 +200,4 @@ teremos terão teria teríamos -teriam +teriam \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt index ecb83d4a7f39..8a800b74497d 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt @@ -148,4 +148,4 @@ всегда конечно всю -между +между \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt index 59bc786caa49..94f493a8d1e0 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt @@ -310,4 +310,4 @@ tenido tenida tenidos tenidas -tened +tened \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt index 742bb6263b99..9fae31c1858a 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt @@ -111,4 +111,4 @@ vårt våra ert era -vilkas +vilkas \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt index 5a48ccce0737..4e9708d9d2c5 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt @@ -50,4 +50,4 @@ tüm ve veya ya -yani +yani \ No newline at end of file From 41cd25815af3baa8fe9ed9336812f436d7ed7bd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 18:25:36 +0200 Subject: [PATCH 09/22] update stopwordsremover --- .../spark/ml/feature/StopWordsRemover.scala | 83 ++++++------------- .../ml/feature/StopWordsRemoverSuite.scala | 26 +++++- 2 files changed, 48 insertions(+), 61 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 40674b2aaee0..f3cd55c1984e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -33,18 +33,14 @@ private[spark] object StopWords { /** Read stop words list from resources */ def readStopWords(language: String): Array[String] = { + require(supportedLanguages.contains(language), s"$language is not in language list") val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") scala.io.Source.fromInputStream(is).getLines().toArray } /** Supported languages list must be lowercase */ - val supportedLanguages = Array("danish", "dutch", "english", "finnish", "french", "german", + private val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german", "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish") - - /** Languages and stopwords map */ - val languageMap = supportedLanguages.map{ - language => language -> readStopWords(language) - }.toMap } /** @@ -67,16 +63,13 @@ class StopWordsRemover(override val uid: String) /** * the stop words set to be filtered out - * Default: [[StopWords.languageMap("english")]] + * Default: [[Array.empty]] * @group param */ val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") /** @group setParam */ - def setStopWords(value: Array[String]): this.type = { - set(stopWords, value) - set(language, "unknown") - } + def setStopWords(value: Array[String]): this.type = set(stopWords, value) /** @group getParam */ def getStopWords: Array[String] = $(stopWords) @@ -96,70 +89,39 @@ class StopWordsRemover(override val uid: String) def getCaseSensitive: Boolean = $(caseSensitive) /** - * the language of stop words - * Default: "english" - * @group param - */ + * the language of stop words + * Supported languages: Danish, Dutch, English, Finnish, French, German, Hungarian, + * Italian, Norwegian, Portuguese, Russian, Spanish, Swedish, Turkish + * Default: "English" + * @group param + */ val language: Param[String] = new Param[String](this, "language", "stopwords language") /** @group setParam */ - def setLanguage(value: String): this.type = { - val lang = value.toLowerCase - require(StopWords.languageMap.contains(lang), s"$lang is not in language list") - set(language, lang) - set(stopWords, StopWords.languageMap(lang)) - } + def setLanguage(value: String): this.type = set(language, value.toLowerCase) /** @group getParam */ def getLanguage: String = $(language) - /** - * the ignored stop words set to be ignored out - * Default: [[Array.empty]] - * @group param - */ - val ignoredWords: StringArrayParam = new StringArrayParam(this, "ignoredWords", - "the ignored stop words set to be ignored out") - - /** @group setParam */ - def setIgnoredWords(value: Array[String]): this.type = set(ignoredWords, value) - - /** @group getParam */ - def getIgnoredWords: Array[String] = $(ignoredWords) - - /** - * the additional stop words set to be filtered out - * Default: [[Array.empty]] - * @group param - */ - val additionalWords: StringArrayParam = new StringArrayParam(this, "additionalWords", - "the additional stop words set to be filtered out") - - /** @group setParam */ - def setAdditionalWords(value: Array[String]): this.type = set(additionalWords, value) - - /** @group getParam */ - def getAdditionalWords: Array[String] = $(additionalWords) - - setDefault(stopWords -> StopWords.languageMap("english"), + setDefault(stopWords -> Array.empty[String], language -> "english", - ignoredWords -> Array.empty[String], - additionalWords -> Array.empty[String], caseSensitive -> false) override def transform(dataset: DataFrame): DataFrame = { + val stopWordsSet = if ($(stopWords).isEmpty) { + StopWords.readStopWords($(language)).toSet + } else { + $(stopWords).toSet + } + val outputSchema = transformSchema(dataset.schema) val t = if ($(caseSensitive)) { - val stopWordsSet = ($(stopWords) ++ $(additionalWords)).toSet -- $(ignoredWords).toSet udf { terms: Seq[String] => terms.filter(s => !stopWordsSet.contains(s)) } } else { val toLower = (s: String) => if (s != null) s.toLowerCase else s - val lowerStopWords = { - ($(stopWords) ++ $(additionalWords)) - .map(toLower(_)).toSet -- $(ignoredWords).map(toLower(_)).toSet - } + val lowerStopWords = stopWordsSet.map(toLower(_)).toSet udf { terms: Seq[String] => terms.filter(s => !lowerStopWords.contains(toLower(s))) } @@ -185,4 +147,11 @@ object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { @Since("1.6.0") override def load(path: String): StopWordsRemover = super.load(path) + + /** + * Stop words for the language + * Supported languages: Danish, Dutch, English, Finnish, French, German, Hungarian, + * Italian, Norwegian, Portuguese, Russian, Spanish, Swedish, Turkish + */ + def loadStopWords(language: String): Array[String] = StopWords.readStopWords(language.toLowerCase) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 92c177ad6861..44fb1c8f04de 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -54,6 +54,24 @@ class StopWordsRemoverSuite testStopWordsRemover(remover, dataSet) } + test("StopWordsRemover with particular stop words list") { + val stopWords = Array("test", "a", "an", "the") + val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") + .setStopWords(stopWords) + val dataSet = sqlContext.createDataFrame(Seq( + (Seq("test", "test"), Seq()), + (Seq("a", "b", "c", "d"), Seq("b", "c")), + (Seq("a", "the", "an"), Seq()), + (Seq("A", "The", "AN"), Seq()), + (Seq(null), Seq(null)), + (Seq(), Seq()) + )).toDF("raw", "expected") + + testStopWordsRemover(remover, dataSet) + } + test("StopWordsRemover case sensitive") { val remover = new StopWordsRemover() .setInputCol("raw") @@ -68,11 +86,11 @@ class StopWordsRemoverSuite } test("StopWordsRemover with ignored words") { - val ignoredWords = Array("a") + val stopWords = StopWordsRemover.loadStopWords("english").toSet -- Set("a") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") - .setIgnoredWords(ignoredWords) + .setStopWords(stopWords.toArray) val dataSet = sqlContext.createDataFrame(Seq( (Seq("python", "scala", "a"), Seq("python", "scala", "a")), (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift")) @@ -82,11 +100,11 @@ class StopWordsRemoverSuite } test("StopWordsRemover with additional words") { - val additionalWords = Array("python", "scala") + val stopWords = StopWordsRemover.loadStopWords("english").toSet ++ Set("python", "scala") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") - .setAdditionalWords(additionalWords) + .setStopWords(stopWords.toArray) val dataSet = sqlContext.createDataFrame(Seq( (Seq("python", "scala", "a"), Seq()), (Seq("Python", "Scala", "swift"), Seq("swift")) From 4d1812aae64b0b15312940b1a6c42e19f9686480 Mon Sep 17 00:00:00 2001 From: Burak KOSE Date: Tue, 22 Mar 2016 19:35:37 +0200 Subject: [PATCH 10/22] fix test case bug After updating English stop words list, "d" is a stop word. --- .../org/apache/spark/ml/feature/StopWordsRemoverSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 44fb1c8f04de..89727e74f41e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -44,7 +44,7 @@ class StopWordsRemoverSuite .setOutputCol("filtered") val dataSet = sqlContext.createDataFrame(Seq( (Seq("test", "test"), Seq("test", "test")), - (Seq("a", "b", "c", "d"), Seq("b", "c", "d")), + (Seq("a", "b", "c", "d"), Seq("b", "c")), (Seq("a", "the", "an"), Seq()), (Seq("A", "The", "AN"), Seq()), (Seq(null), Seq(null)), From a30862231c3944c55c96cc94e162f61614aee6d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 23:45:48 +0200 Subject: [PATCH 11/22] fix encoding --- .../scala/org/apache/spark/ml/feature/StopWordsRemover.scala | 2 +- .../org/apache/spark/ml/feature/StopWordsRemoverSuite.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index f3cd55c1984e..ab77d9570e94 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -35,7 +35,7 @@ private[spark] object StopWords { def readStopWords(language: String): Array[String] = { require(supportedLanguages.contains(language), s"$language is not in language list") val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") - scala.io.Source.fromInputStream(is).getLines().toArray + scala.io.Source.fromInputStream(is)(scala.io.Codec.UTF8).getLines().toArray } /** Supported languages list must be lowercase */ diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 89727e74f41e..0c0197fc70e4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -62,7 +62,7 @@ class StopWordsRemoverSuite .setStopWords(stopWords) val dataSet = sqlContext.createDataFrame(Seq( (Seq("test", "test"), Seq()), - (Seq("a", "b", "c", "d"), Seq("b", "c")), + (Seq("a", "b", "c", "d"), Seq("b", "c", "d")), (Seq("a", "the", "an"), Seq()), (Seq("A", "The", "AN"), Seq()), (Seq(null), Seq(null)), From 2e7c54e5c17e7c5672a43ffc28acb207e94bf28a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Wed, 23 Mar 2016 03:42:36 +0200 Subject: [PATCH 12/22] fix pyspark test --- python/pyspark/ml/feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 5025493c42c3..a17c85ae6e67 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1700,8 +1700,8 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords - defaultStopWords = stopWordsObj.English() + stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover + defaultStopWords = stopWordsObj.loadStopWords("english") self._setDefault(stopWords=defaultStopWords, caseSensitive=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) From 7efda40e39663deef0b0884a7bfca13b5d10d706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Wed, 23 Mar 2016 18:51:48 +0200 Subject: [PATCH 13/22] add licence for stop words list --- licenses/LICENCE-postgresql.txt | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 licenses/LICENCE-postgresql.txt diff --git a/licenses/LICENCE-postgresql.txt b/licenses/LICENCE-postgresql.txt new file mode 100644 index 000000000000..515bf9af4d43 --- /dev/null +++ b/licenses/LICENCE-postgresql.txt @@ -0,0 +1,24 @@ +PostgreSQL Database Management System +(formerly known as Postgres, then as Postgres95) + +Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + +Portions Copyright (c) 1994, The Regents of the University of California + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose, without fee, and without a written agreement +is hereby granted, provided that the above copyright notice and this +paragraph and the following two paragraphs appear in all copies. + +IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR +DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS +DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS +ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO +PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + From a066e8b34ec4824fa26a1e306e197b66400f5ccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Thu, 24 Mar 2016 19:12:20 +0200 Subject: [PATCH 14/22] change licence to license --- licenses/{LICENCE-postgresql.txt => LICENSE-postgresql.txt} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename licenses/{LICENCE-postgresql.txt => LICENSE-postgresql.txt} (100%) diff --git a/licenses/LICENCE-postgresql.txt b/licenses/LICENSE-postgresql.txt similarity index 100% rename from licenses/LICENCE-postgresql.txt rename to licenses/LICENSE-postgresql.txt From d0f43ace892332dfb3ad25d0ef1d0c0451540e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Fri, 25 Mar 2016 18:23:37 +0200 Subject: [PATCH 15/22] add readme for stopwords list --- .../org/apache/spark/ml/feature/stopwords/README | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100755 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README new file mode 100755 index 000000000000..ec08a5080774 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README @@ -0,0 +1,12 @@ +Stopwords Corpus + +This corpus contains lists of stop words for several languages. These +are high-frequency grammatical words which are usually ignored in text +retrieval applications. + +They were obtained from: +http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ + +The English list has been augmented +https://github.com/nltk/nltk_data/issues/22 + From c017ee235287554e28281d1691d0188e358b7ad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Fri, 25 Mar 2016 18:26:23 +0200 Subject: [PATCH 16/22] merge StopWords into StopWordsRemover --- .../spark/ml/feature/StopWordsRemover.scala | 54 +++++-------------- .../ml/feature/StopWordsRemoverSuite.scala | 27 +++++----- 2 files changed, 28 insertions(+), 53 deletions(-) mode change 100644 => 100755 mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala mode change 100644 => 100755 mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala old mode 100644 new mode 100755 index ab77d9570e94..b991932f3615 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -26,23 +26,6 @@ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StringType, StructType} -/** - * stop words list - */ -private[spark] object StopWords { - - /** Read stop words list from resources */ - def readStopWords(language: String): Array[String] = { - require(supportedLanguages.contains(language), s"$language is not in language list") - val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") - scala.io.Source.fromInputStream(is)(scala.io.Codec.UTF8).getLines().toArray - } - - /** Supported languages list must be lowercase */ - private val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german", - "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish") -} - /** * :: Experimental :: * A feature transformer that filters out stop words from input. @@ -88,28 +71,11 @@ class StopWordsRemover(override val uid: String) /** @group getParam */ def getCaseSensitive: Boolean = $(caseSensitive) - /** - * the language of stop words - * Supported languages: Danish, Dutch, English, Finnish, French, German, Hungarian, - * Italian, Norwegian, Portuguese, Russian, Spanish, Swedish, Turkish - * Default: "English" - * @group param - */ - val language: Param[String] = new Param[String](this, "language", "stopwords language") - - /** @group setParam */ - def setLanguage(value: String): this.type = set(language, value.toLowerCase) - - /** @group getParam */ - def getLanguage: String = $(language) - - setDefault(stopWords -> Array.empty[String], - language -> "english", - caseSensitive -> false) + setDefault(stopWords -> Array.empty[String], caseSensitive -> false) override def transform(dataset: DataFrame): DataFrame = { val stopWordsSet = if ($(stopWords).isEmpty) { - StopWords.readStopWords($(language)).toSet + StopWordsRemover.loadStopWords("english").toSet } else { $(stopWords).toSet } @@ -145,13 +111,21 @@ class StopWordsRemover(override val uid: String) @Since("1.6.0") object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { + private val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german", + "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish") + @Since("1.6.0") override def load(path: String): StopWordsRemover = super.load(path) /** - * Stop words for the language - * Supported languages: Danish, Dutch, English, Finnish, French, German, Hungarian, - * Italian, Norwegian, Portuguese, Russian, Spanish, Swedish, Turkish + * Load stop words for the language + * Supported languages: danish, dutch, english, finnish, french, german, hungarian, + * italian, norwegian, portuguese, russian, spanish, swedish, turkish + * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]] */ - def loadStopWords(language: String): Array[String] = StopWords.readStopWords(language.toLowerCase) + def loadStopWords(language: String): Array[String] = { + require(supportedLanguages.contains(language), s"$language is not in language list") + val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") + scala.io.Source.fromInputStream(is)(scala.io.Codec.UTF8).getLines().toArray + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala old mode 100644 new mode 100755 index 0c0197fc70e4..0511d1af4db5 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -85,42 +85,43 @@ class StopWordsRemoverSuite testStopWordsRemover(remover, dataSet) } - test("StopWordsRemover with ignored words") { - val stopWords = StopWordsRemover.loadStopWords("english").toSet -- Set("a") + test("StopWordsRemover with language selection") { + val stopWords = StopWordsRemover.loadStopWords("turkish") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") - .setStopWords(stopWords.toArray) + .setStopWords(stopWords) val dataSet = sqlContext.createDataFrame(Seq( - (Seq("python", "scala", "a"), Seq("python", "scala", "a")), - (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift")) + (Seq("acaba", "ama", "biri"), Seq()), + (Seq("hep", "her", "scala"), Seq("scala")) )).toDF("raw", "expected") testStopWordsRemover(remover, dataSet) } - test("StopWordsRemover with additional words") { - val stopWords = StopWordsRemover.loadStopWords("english").toSet ++ Set("python", "scala") + test("StopWordsRemover with ignored words") { + val stopWords = StopWordsRemover.loadStopWords("english").toSet -- Set("a") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") .setStopWords(stopWords.toArray) val dataSet = sqlContext.createDataFrame(Seq( - (Seq("python", "scala", "a"), Seq()), - (Seq("Python", "Scala", "swift"), Seq("swift")) + (Seq("python", "scala", "a"), Seq("python", "scala", "a")), + (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift")) )).toDF("raw", "expected") testStopWordsRemover(remover, dataSet) } - test("StopWordsRemover with language selection") { + test("StopWordsRemover with additional words") { + val stopWords = StopWordsRemover.loadStopWords("english").toSet ++ Set("python", "scala") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") - .setLanguage("turkish") + .setStopWords(stopWords.toArray) val dataSet = sqlContext.createDataFrame(Seq( - (Seq("acaba", "ama", "biri"), Seq()), - (Seq("hep", "her", "scala"), Seq("scala")) + (Seq("python", "scala", "a"), Seq()), + (Seq("Python", "Scala", "swift"), Seq("swift")) )).toDF("raw", "expected") testStopWordsRemover(remover, dataSet) From 55191ce1f449bed55884a4481071b0fc5ee776a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Fri, 25 Mar 2016 18:27:59 +0200 Subject: [PATCH 17/22] add python stopwords support for language selection --- python/pyspark/ml/feature.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) mode change 100644 => 100755 python/pyspark/ml/feature.py diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py old mode 100644 new mode 100755 index a17c85ae6e67..c9c5dfc6db51 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1700,9 +1700,7 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover - defaultStopWords = stopWordsObj.loadStopWords("english") - self._setDefault(stopWords=defaultStopWords, caseSensitive=False) + self._setDefault(stopWords=self.loadStopWords("english"), caseSensitive=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1748,6 +1746,16 @@ def getCaseSensitive(self): """ return self.getOrDefault(self.caseSensitive) + @staticmethod + def loadStopWords(language): + """ + Load stop words for the language + Supported languages: danish, dutch, english, finnish, french, german, hungarian, + italian, norwegian, portuguese, russian, spanish, swedish, turkish + """ + stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover + return stopWordsObj.loadStopWords(language) + @inherit_doc @ignore_unicode_prefix From 789342f2d26759db180868a9f59b02c8f85cc835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Fri, 25 Mar 2016 18:28:48 +0200 Subject: [PATCH 18/22] add new tests for stopwords --- python/pyspark/ml/tests.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) mode change 100644 => 100755 python/pyspark/ml/tests.py diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py old mode 100644 new mode 100755 index 4da9a373e986..1ae90908341b --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -336,13 +336,20 @@ def test_stopwordsremover(self): self.assertEqual(stopWordRemover.getInputCol(), "input") transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["panda"]) - # Custom + # with particular stop words list stopwords = ["panda"] stopWordRemover.setStopWords(stopwords) self.assertEqual(stopWordRemover.getInputCol(), "input") self.assertEqual(stopWordRemover.getStopWords(), stopwords) transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["a"]) + # with language selection + stopwords = StopWordsRemover.loadStopWords("turkish") + dataset = sqlContext.createDataFrame([Row(input=["acaba", "ama", "biri"])]) + stopWordRemover.setStopWords(stopwords) + self.assertEqual(stopWordRemover.getStopWords(), stopwords) + transformedDF = stopWordRemover.transform(dataset) + self.assertEqual(transformedDF.head().output, []) class HasInducedError(Params): From 713d4d5e81b2194efa640ec46fa16c56049c00f5 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Mon, 2 May 2016 08:51:31 -0700 Subject: [PATCH 19/22] minor updates --- .../spark/ml/feature/StopWordsRemover.scala | 21 ++++++++++--------- python/pyspark/ml/feature.py | 12 ++++------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 19a89bf7736c..9ac43c103e20 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -45,11 +45,11 @@ class StopWordsRemover(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * the stop words set to be filtered out + * The words to be filtered out. * @group param */ - val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") - setDefault(stopWords -> StopWordsRemover.loadStopWords("english")) + val stopWords: StringArrayParam = + new StringArrayParam(this, "stopWords", "the words to be filtered out") /** @group setParam */ def setStopWords(value: Array[String]): this.type = set(stopWords, value) @@ -58,12 +58,12 @@ class StopWordsRemover(override val uid: String) def getStopWords: Array[String] = $(stopWords) /** - * whether to do a case sensitive comparison over the stop words + * Whether to do a case sensitive comparison over the stop words. * Default: false * @group param */ val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive", - "whether to do case-sensitive comparison during filtering") + "whether to do a case-sensitive comparison over the stop stop words") /** @group setParam */ def setCaseSensitive(value: Boolean): this.type = set(caseSensitive, value) @@ -71,24 +71,23 @@ class StopWordsRemover(override val uid: String) /** @group getParam */ def getCaseSensitive: Boolean = $(caseSensitive) - setDefault(stopWords -> Array.empty[String], caseSensitive -> false) + setDefault(stopWords -> StopWordsRemover.loadStopWords("english"), caseSensitive -> false) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) - val stopWordsSet = $(stopWords).toSet val t = if ($(caseSensitive)) { + val stopWordsSet = $(stopWords).toSet udf { terms: Seq[String] => terms.filter(s => !stopWordsSet.contains(s)) } } else { val toLower = (s: String) => if (s != null) s.toLowerCase else s - val lowerStopWords = stopWordsSet.map(toLower(_)) + val lowerStopWords = $(stopWords).map(toLower(_)).toSet udf { terms: Seq[String] => terms.filter(s => !lowerStopWords.contains(toLower(s))) } } - val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @@ -118,8 +117,10 @@ object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { * italian, norwegian, portuguese, russian, spanish, swedish, turkish * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]] */ + @Since("2.0.0") def loadStopWords(language: String): Array[String] = { - require(supportedLanguages.contains(language), s"$language is not in language list") + require(supportedLanguages.contains(language), + s"$language is not in the supported language list: ${supportedLanguages.mkString(", ")}.") val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") scala.io.Source.fromInputStream(is)(scala.io.Codec.UTF8).getLines().toArray } diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 423cb82e9b6f..736043fbad82 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1763,11 +1763,9 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadabl "comparison over the stop words", typeConverter=TypeConverters.toBoolean) @keyword_only - def __init__(self, inputCol=None, outputCol=None, stopWords=None, - caseSensitive=False): + def __init__(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False): """ - __init__(self, inputCol=None, outputCol=None, stopWords=None,\ - caseSensitive=false) + __init__(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=false) """ super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", @@ -1778,11 +1776,9 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, @keyword_only @since("1.6.0") - def setParams(self, inputCol=None, outputCol=None, stopWords=None, - caseSensitive=False): + def setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False): """ - setParams(self, inputCol="input", outputCol="output", stopWords=None,\ - caseSensitive=false) + setParams(self, inputCol="input", outputCol="output", stopWords=None, caseSensitive=false) Sets params for this StopWordRemover. """ kwargs = self.setParams._input_kwargs From 9f488fb606315be627ce6e93a15e7a8eda70467f Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Mon, 2 May 2016 09:05:52 -0700 Subject: [PATCH 20/22] fix python tests and add a TODO --- .../scala/org/apache/spark/ml/feature/StopWordsRemover.scala | 3 +++ python/pyspark/ml/feature.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 9ac43c103e20..5e396c0891c5 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -46,6 +46,8 @@ class StopWordsRemover(override val uid: String) /** * The words to be filtered out. + * Default: English stop words + * @see [[StopWordsRemover.loadStopWords()]] * @group param */ val stopWords: StringArrayParam = @@ -82,6 +84,7 @@ class StopWordsRemover(override val uid: String) terms.filter(s => !stopWordsSet.contains(s)) } } else { + // TODO: support user locale (SPARK-15064) val toLower = (s: String) => if (s != null) s.toLowerCase else s val lowerStopWords = $(stopWords).map(toLower(_)).toSet udf { terms: Seq[String] => diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 736043fbad82..af9a1f29c46f 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1815,6 +1815,7 @@ def getCaseSensitive(self): return self.getOrDefault(self.caseSensitive) @staticmethod + @since("2.0.0") def loadStopWords(language): """ Load stop words for the language @@ -1822,7 +1823,7 @@ def loadStopWords(language): italian, norwegian, portuguese, russian, spanish, swedish, turkish """ stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover - return stopWordsObj.loadStopWords(language) + return list(stopWordsObj.loadStopWords(language)) @inherit_doc From e2d0aba512fb2160656ce716a4f042b9a5dca032 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 4 May 2016 09:00:25 -0700 Subject: [PATCH 21/22] address feedback --- .../spark/ml/feature/StopWordsRemover.scala | 13 +++++++------ .../ml/feature/StopWordsRemoverSuite.scala | 13 ++++++++++--- python/pyspark/ml/feature.py | 18 +++++++++--------- python/pyspark/ml/tests.py | 2 +- 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 5e396c0891c5..11864cb8f439 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -47,7 +47,7 @@ class StopWordsRemover(override val uid: String) /** * The words to be filtered out. * Default: English stop words - * @see [[StopWordsRemover.loadStopWords()]] + * @see [[StopWordsRemover.loadDefaultStopWords()]] * @group param */ val stopWords: StringArrayParam = @@ -65,7 +65,7 @@ class StopWordsRemover(override val uid: String) * @group param */ val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive", - "whether to do a case-sensitive comparison over the stop stop words") + "whether to do a case-sensitive comparison over the stop words") /** @group setParam */ def setCaseSensitive(value: Boolean): this.type = set(caseSensitive, value) @@ -73,7 +73,7 @@ class StopWordsRemover(override val uid: String) /** @group getParam */ def getCaseSensitive: Boolean = $(caseSensitive) - setDefault(stopWords -> StopWordsRemover.loadStopWords("english"), caseSensitive -> false) + setDefault(stopWords -> StopWordsRemover.loadDefaultStopWords("english"), caseSensitive -> false) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { @@ -108,20 +108,21 @@ class StopWordsRemover(override val uid: String) @Since("1.6.0") object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { - private val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german", + private[feature] + val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german", "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish") @Since("1.6.0") override def load(path: String): StopWordsRemover = super.load(path) /** - * Load stop words for the language + * Loads the default stop words for the given language. * Supported languages: danish, dutch, english, finnish, french, german, hungarian, * italian, norwegian, portuguese, russian, spanish, swedish, turkish * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]] */ @Since("2.0.0") - def loadStopWords(language: String): Array[String] = { + def loadDefaultStopWords(language: String): Array[String] = { require(supportedLanguages.contains(language), s"$language is not in the supported language list: ${supportedLanguages.mkString(", ")}.") val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 99b353addb1f..8e7e000fbc11 100755 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -85,8 +85,15 @@ class StopWordsRemoverSuite testStopWordsRemover(remover, dataSet) } + test("default stop words of supported languages are not empty") { + StopWordsRemover.supportedLanguages.foreach { lang => + assert(StopWordsRemover.loadDefaultStopWords(lang).nonEmpty, + s"The default stop words of $lang cannot be empty.") + } + } + test("StopWordsRemover with language selection") { - val stopWords = StopWordsRemover.loadStopWords("turkish") + val stopWords = StopWordsRemover.loadDefaultStopWords("turkish") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") @@ -100,7 +107,7 @@ class StopWordsRemoverSuite } test("StopWordsRemover with ignored words") { - val stopWords = StopWordsRemover.loadStopWords("english").toSet -- Set("a") + val stopWords = StopWordsRemover.loadDefaultStopWords("english").toSet -- Set("a") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") @@ -114,7 +121,7 @@ class StopWordsRemoverSuite } test("StopWordsRemover with additional words") { - val stopWords = StopWordsRemover.loadStopWords("english").toSet ++ Set("python", "scala") + val stopWords = StopWordsRemover.loadDefaultStopWords("english").toSet ++ Set("python", "scala") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index af9a1f29c46f..fc8f054ccdaa 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1778,7 +1778,7 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive= @since("1.6.0") def setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False): """ - setParams(self, inputCol="input", outputCol="output", stopWords=None, caseSensitive=false) + setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=false) Sets params for this StopWordRemover. """ kwargs = self.setParams._input_kwargs @@ -1787,7 +1787,7 @@ def setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive @since("1.6.0") def setStopWords(self, value): """ - Specify the stopwords to be filtered. + Sets the value of :py:attr:`stopWords`. """ self._set(stopWords=value) return self @@ -1795,14 +1795,14 @@ def setStopWords(self, value): @since("1.6.0") def getStopWords(self): """ - Get the stopwords. + Gets the value of :py:attr:`stopWords` or its default value. """ return self.getOrDefault(self.stopWords) @since("1.6.0") def setCaseSensitive(self, value): """ - Set whether to do a case sensitive comparison over the stop words + Sets the value of :py:attr:`caseSensitive`. """ self._set(caseSensitive=value) return self @@ -1810,20 +1810,20 @@ def setCaseSensitive(self, value): @since("1.6.0") def getCaseSensitive(self): """ - Get whether to do a case sensitive comparison over the stop words. + Gets the value of :py:attr:`caseSensitive` or its default value. """ return self.getOrDefault(self.caseSensitive) @staticmethod @since("2.0.0") - def loadStopWords(language): + def loadDefaultStopWords(language): """ - Load stop words for the language + Loads the default stop words for the given language. Supported languages: danish, dutch, english, finnish, french, german, hungarian, italian, norwegian, portuguese, russian, spanish, swedish, turkish """ stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover - return list(stopWordsObj.loadStopWords(language)) + return list(stopWordsObj.loadDefaultStopWords(language)) @inherit_doc @@ -1875,7 +1875,7 @@ def __init__(self, inputCol=None, outputCol=None): @since("1.3.0") def setParams(self, inputCol=None, outputCol=None): """ - setParams(self, inputCol="input", outputCol="output") + setParams(self, inputCol=None, outputCol=None) Sets params for this Tokenizer. """ kwargs = self.setParams._input_kwargs diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index e3a82453644b..ad1631fb5baa 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -418,7 +418,7 @@ def test_stopwordsremover(self): transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["a"]) # with language selection - stopwords = StopWordsRemover.loadStopWords("turkish") + stopwords = StopWordsRemover.loadDefaultStopWords("turkish") dataset = sqlContext.createDataFrame([Row(input=["acaba", "ama", "biri"])]) stopWordRemover.setStopWords(stopwords) self.assertEqual(stopWordRemover.getStopWords(), stopwords) From df2d98f6951c360c950c6c8c5625f9f8d7ec95bf Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Thu, 5 May 2016 14:13:59 -0700 Subject: [PATCH 22/22] fix python test --- python/pyspark/ml/feature.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index fc8f054ccdaa..62a293184224 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1770,7 +1770,8 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive= super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - self._setDefault(stopWords=StopWordsRemover.loadStopWords("english"), caseSensitive=False) + self._setDefault(stopWords=StopWordsRemover.loadDefaultStopWords("english"), + caseSensitive=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs)