diff --git a/libhdt/include/Dictionary.hpp b/libhdt/include/Dictionary.hpp index b9b4b7c8..806ba8b3 100644 --- a/libhdt/include/Dictionary.hpp +++ b/libhdt/include/Dictionary.hpp @@ -132,6 +132,7 @@ class Dictionary virtual size_t getMaxObjectID()=0; virtual void import(Dictionary *other, ProgressListener *listener=NULL)=0; + virtual void import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener=NULL)=0; virtual IteratorUCharString *getSubjects()=0; virtual IteratorUCharString *getPredicates()=0; diff --git a/libhdt/include/HDTEnums.hpp b/libhdt/include/HDTEnums.hpp index 203991c8..5323d9b2 100644 --- a/libhdt/include/HDTEnums.hpp +++ b/libhdt/include/HDTEnums.hpp @@ -125,6 +125,11 @@ enum ResultEstimationType { EXACT }; +enum LoaderType { + ONE_PASS, + TWO_PASS +}; + } #endif /* HDT_HDTENUMS_HPP_ */ diff --git a/libhdt/include/HDTManager.hpp b/libhdt/include/HDTManager.hpp index 57911a75..293473e5 100644 --- a/libhdt/include/HDTManager.hpp +++ b/libhdt/include/HDTManager.hpp @@ -89,7 +89,7 @@ class HDTManager { * @throws IOException * @throws ParserException */ - static HDT *generateHDT(const char *rdfFileName, const char *baseURI, RDFNotation rdfNotation, HDTSpecification &hdtFormat, ProgressListener *listener=NULL); + static HDT *generateHDT(const char *rdfFileName, const char *baseURI, RDFNotation rdfNotation, HDTSpecification &hdtFormat, ProgressListener *listener=NULL, LoaderType LoaderType = LoaderType::TWO_PASS); }; } diff --git a/libhdt/src/dictionary/FourSectionDictionary.cpp b/libhdt/src/dictionary/FourSectionDictionary.cpp index 8bf6dc2b..60da213e 100644 --- a/libhdt/src/dictionary/FourSectionDictionary.cpp +++ b/libhdt/src/dictionary/FourSectionDictionary.cpp @@ -307,6 +307,21 @@ void FourSectionDictionary::import(Dictionary *other, ProgressListener *listener } } +void FourSectionDictionary::import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener) { + + this->import(other, listener); + + // Update all IDs according to new dictionary + IteratorTripleID *it = triplesList->searchAll(); + while(it->hasNext()){ + TripleID *tripleID = it->next(); + TripleString *triple = new TripleString(); + other->tripleIDtoTripleString(*tripleID, *triple); + this->tripleStringtoTripleID(*triple, *tripleID); + delete triple; + } +} + IteratorUCharString *FourSectionDictionary::getSubjects() { return subjects->listAll(); } diff --git a/libhdt/src/dictionary/FourSectionDictionary.hpp b/libhdt/src/dictionary/FourSectionDictionary.hpp index f8f55060..042c9be4 100644 --- a/libhdt/src/dictionary/FourSectionDictionary.hpp +++ b/libhdt/src/dictionary/FourSectionDictionary.hpp @@ -83,6 +83,7 @@ class FourSectionDictionary : public Dictionary { size_t load(unsigned char *ptr, unsigned char *ptrMax, ProgressListener *listener=NULL); void import(Dictionary *other, ProgressListener *listener=NULL); + void import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener=NULL); IteratorUCharString *getSubjects(); IteratorUCharString *getPredicates(); diff --git a/libhdt/src/dictionary/KyotoDictionary.cpp b/libhdt/src/dictionary/KyotoDictionary.cpp index f757d08f..08f79685 100644 --- a/libhdt/src/dictionary/KyotoDictionary.cpp +++ b/libhdt/src/dictionary/KyotoDictionary.cpp @@ -328,6 +328,10 @@ void KyotoDictionary::import(Dictionary *other, ProgressListener *listener) { throw std::logic_error("Not implemented"); } +void KyotoDictionary::import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener) { + throw std::logic_error("Not implemented import"); +} + IteratorUCharString *KyotoDictionary::getSubjects() { return new KyotoDictIterator(&this->subjects); } diff --git a/libhdt/src/dictionary/KyotoDictionary.hpp b/libhdt/src/dictionary/KyotoDictionary.hpp index bfa22df8..00db497b 100644 --- a/libhdt/src/dictionary/KyotoDictionary.hpp +++ b/libhdt/src/dictionary/KyotoDictionary.hpp @@ -93,6 +93,7 @@ class KyotoDictionary : public ModifiableDictionary { size_t load(unsigned char *ptr, unsigned char *ptrMax, ProgressListener *listener=NULL); void import(Dictionary *other, ProgressListener *listener=NULL); + void import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener=NULL); IteratorUCharString *getSubjects(); IteratorUCharString *getPredicates(); diff --git a/libhdt/src/dictionary/LiteralDictionary.cpp b/libhdt/src/dictionary/LiteralDictionary.cpp index e9b1c019..7c2a8683 100644 --- a/libhdt/src/dictionary/LiteralDictionary.cpp +++ b/libhdt/src/dictionary/LiteralDictionary.cpp @@ -392,6 +392,10 @@ void LiteralDictionary::import( Dictionary *other, ProgressListener *listener) { } } +void LiteralDictionary::import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener) { + throw std::logic_error("Not implemented import"); +} + IteratorUCharString *LiteralDictionary::getSubjects() { throw std::logic_error("Not implemented"); } diff --git a/libhdt/src/dictionary/LiteralDictionary.hpp b/libhdt/src/dictionary/LiteralDictionary.hpp index 979fd4b7..f6cc75e9 100644 --- a/libhdt/src/dictionary/LiteralDictionary.hpp +++ b/libhdt/src/dictionary/LiteralDictionary.hpp @@ -96,6 +96,7 @@ class LiteralDictionary : public Dictionary { size_t load(unsigned char *ptr, unsigned char *ptrMax, ProgressListener *listener=NULL); void import(Dictionary *other, ProgressListener *listener=NULL); + void import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener=NULL); IteratorUCharString *getSubjects(); IteratorUCharString *getPredicates(); diff --git a/libhdt/src/dictionary/PlainDictionary.cpp b/libhdt/src/dictionary/PlainDictionary.cpp index d5eaf8db..843224df 100644 --- a/libhdt/src/dictionary/PlainDictionary.cpp +++ b/libhdt/src/dictionary/PlainDictionary.cpp @@ -256,6 +256,10 @@ void PlainDictionary::import(Dictionary *other, ProgressListener *listener) { throw std::logic_error("Not implemented import"); } +void PlainDictionary::import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener) { + throw std::logic_error("Not implemented import"); +} + IteratorUCharString *PlainDictionary::getSubjects() { return new DictIterator(this->subjects); } @@ -316,17 +320,22 @@ size_t PlainDictionary::insert(const std::string & str, TripleComponentRole pos) DictionaryEntry *entry = new DictionaryEntry; entry->str = new char [str.length()+1]; strcpy(entry->str, str.c_str()); + entry->id = subjects.size()+1; sizeStrings += str.length(); //cout << " Add new subject: " << str << endl; hashSubject[entry->str] = entry; + subjects.push_back(entry); + return entry->id; } else if(foundSubject) { // Already exists in subjects. //cout << " existing subject: " << str << endl; + return subjectIt->second->id; } else if(foundObject) { // Already exists in objects. //cout << " existing subject as object: " << str << endl; hashSubject[objectIt->second->str] = objectIt->second; + return objectIt->second->id; } } else if(pos==OBJECT) { if(!foundSubject && !foundObject) { @@ -334,22 +343,24 @@ size_t PlainDictionary::insert(const std::string & str, TripleComponentRole pos) DictionaryEntry *entry = new DictionaryEntry; entry->str = new char [str.length()+1]; strcpy(entry->str, str.c_str()); + entry->id = objects.size()+1; sizeStrings += str.length(); //cout << " Add new object: " << str << endl; hashObject[entry->str] = entry; + objects.push_back(entry); + return entry->id; } else if(foundObject) { // Already exists in objects. //cout << " existing object: " << str << endl; + return objectIt->second->id; } else if(foundSubject) { // Already exists in subjects. //cout << " existing object as subject: " << str << endl; hashObject[subjectIt->second->str] = subjectIt->second; + return subjectIt->second->id; } } - - // FIXME: Return inserted index? - return 0; } string intToStr(int val) { diff --git a/libhdt/src/dictionary/PlainDictionary.hpp b/libhdt/src/dictionary/PlainDictionary.hpp index a58114af..6f187a89 100644 --- a/libhdt/src/dictionary/PlainDictionary.hpp +++ b/libhdt/src/dictionary/PlainDictionary.hpp @@ -145,6 +145,7 @@ class PlainDictionary : public ModifiableDictionary { size_t load(unsigned char *ptr, unsigned char *ptrMax, ProgressListener *listener=NULL); void import(Dictionary *other, ProgressListener *listener=NULL); + void import(Dictionary *other, ModifiableTriples *triplesList, ProgressListener *listener=NULL); IteratorUCharString *getSubjects(); IteratorUCharString *getPredicates(); diff --git a/libhdt/src/hdt/BasicHDT.cpp b/libhdt/src/hdt/BasicHDT.cpp index 1ed2ce5e..620dde77 100644 --- a/libhdt/src/hdt/BasicHDT.cpp +++ b/libhdt/src/hdt/BasicHDT.cpp @@ -98,7 +98,7 @@ void BasicHDT::createComponents() { std::string dictType = ""; try{ - spec.get("dictionary.type"); + dictType = spec.get("dictionary.type"); } catch (std::exception& e){ } @@ -399,7 +399,7 @@ void BasicHDT::fillHeader(const string& baseUri) { header->insert(publicationInfoNode, HDTVocabulary::DUBLIN_CORE_ISSUED, date); } -void BasicHDT::loadFromRDF(const char *fileName, string baseUri, RDFNotation notation, ProgressListener *listener) +void BasicHDT::loadFromRDF(const char *fileName, string baseUri, RDFNotation notation, ProgressListener *listener, LoaderType loaderType) { try { // Make sure that URI starts and ends with <> @@ -410,11 +410,20 @@ void BasicHDT::loadFromRDF(const char *fileName, string baseUri, RDFNotation not IntermediateListener iListener(listener); - iListener.setRange(0,50); - loadDictionary(fileName, baseUri.c_str(), notation, &iListener); - - iListener.setRange(50,99); - loadTriples(fileName, baseUri.c_str(), notation, &iListener); + switch(loaderType) { + case ONE_PASS: + iListener.setRange(0,99); + loadOnePass(fileName, baseUri.c_str(), notation, &iListener); + break; + + case TWO_PASS: + default: + iListener.setRange(0,50); + loadDictionary(fileName, baseUri.c_str(), notation, &iListener); + + iListener.setRange(50,99); + loadTriples(fileName, baseUri.c_str(), notation, &iListener); + } fillHeader(baseUri); @@ -964,4 +973,105 @@ void BasicHDT::saveIndex(ProgressListener *listener) { out.close(); } +/* ONE PASS logic */ + +void BasicHDT::loadOnePass(const char* fileName, const char* baseUri, RDFNotation notation, ProgressListener* listener) { + + StopWatch st; + IntermediateListener iListener(listener); + + // Create temporary dictionary + ModifiableDictionary *dict = getLoadDictionary(); + ModifiableTriples* triplesList = new TriplesList(spec); + + try { + NOTIFY(listener, "Loading Dictionary & Triples", 0, 100); + iListener.setRange(0, 80); + + dict->startProcessing(); + triplesList->startProcessing(); + + // Load data + OnePassLoader loader(dict, triplesList, &iListener); + + RDFParserCallback *parser = RDFParserCallback::getParserCallback(notation); + parser->doParse(fileName, baseUri, notation, true, &loader); + delete parser; + + header->insert("_:statistics", HDTVocabulary::ORIGINAL_SIZE, loader.getSize()); + iListener.setRange(80, 90); + + dict->stopProcessing(&iListener); + triplesList->stopProcessing(&iListener); + + // Convert to final format + if (dictionary->getType()!=HDTVocabulary::DICTIONARY_TYPE_PLAIN){ + dictionary->import(dict, triplesList); + + //TODO: Update Ids triples according to new ID. + + delete dict; + } + else{ + dictionary = dict; + } +#ifndef WIN32 + } catch (char *e) { + cout << "Catch exception dictionary/triples: " << e << endl; + delete dict; + delete triplesList; + throw e; +#else + } catch(exception& e) { + cerr << "caught here??" << endl; + delete dict; + delete triplesList; + throw; +#endif + } + + if (triples->getType() == triplesList->getType()) { + delete triples; + triples = triplesList; + } else { + iListener.setRange(90, 100); + try { + triples->load(*triplesList, &iListener); + } catch (std::exception& e) { + delete triplesList; + throw; + } + delete triplesList; + } +} + +void OnePassLoader::processTriple(const hdt::TripleString& triple, unsigned long long pos) { + + TripleID ti = TripleID( + dictionary->insert(triple.getSubject(), SUBJECT), + dictionary->insert(triple.getPredicate(), PREDICATE), + dictionary->insert(triple.getObject(), OBJECT) + ); + + if (ti.isValid()) { + triples->insert(ti); + } else { + stringstream msg; + msg << "ERROR: Could not convert triple to IDS! " << endl << triple << endl << ti; + throw ParseException(msg.str()); + } + //cerr << "TripleID: " << ti << endl; + char str[100]; + if ((listener != NULL) && (count % 100000) == 0) { + sprintf(str, "Generating Triples: %lld K triples processed.", count / 1000); + listener->notifyProgress(0, str); + } + count++; + if(pos>sizeBytes) { + sizeBytes = pos; + } +} + + + } diff --git a/libhdt/src/hdt/BasicHDT.hpp b/libhdt/src/hdt/BasicHDT.hpp index 23145ade..5ff95e12 100644 --- a/libhdt/src/hdt/BasicHDT.hpp +++ b/libhdt/src/hdt/BasicHDT.hpp @@ -54,6 +54,7 @@ class BasicHDT : public HDT { void loadDictionary(const char *fileName, const char *baseUri, RDFNotation notation, ProgressListener *listener); void loadTriples(const char *fileName, const char *baseUri, RDFNotation notation, ProgressListener *listener); + void loadOnePass(const char *fileName, const char *baseUri, RDFNotation notation, ProgressListener *listener); void addDictionaryFromHDT(const char *fileName, ModifiableDictionary *dict, ProgressListener *listener=NULL); void loadDictionaryFromHDTs(const char** fileName, size_t numFiles, const char* baseUri, ProgressListener* listener=NULL); @@ -86,7 +87,7 @@ class BasicHDT : public HDT { */ Triples *getTriples(); - void loadFromRDF(const char *fileName, string baseUri, RDFNotation notation, ProgressListener *listener = NULL); + void loadFromRDF(const char *fileName, string baseUri, RDFNotation notation, ProgressListener *listener = NULL, LoaderType loaderType = LoaderType::TWO_PASS); /** * @param input @@ -169,6 +170,24 @@ class TriplesLoader : public RDFCallback { } }; +class OnePassLoader : public RDFCallback { +private: + ModifiableDictionary *dictionary; + ModifiableTriples *triples; + ProgressListener *listener; + unsigned long long count; + uint64_t sizeBytes; +public: + OnePassLoader(ModifiableDictionary *dictionary, ModifiableTriples *triples, ProgressListener *listener) : dictionary(dictionary), triples(triples), listener(listener), count(0), sizeBytes(0) { } + void processTriple(const TripleString &triple, unsigned long long pos); + uint64_t getSize() { + return sizeBytes; + } + inline unsigned long long getCount() { + return count; + } +}; + } #endif /* BASICHDT_HPP_ */ diff --git a/libhdt/src/hdt/HDTManager.cpp b/libhdt/src/hdt/HDTManager.cpp index df942ae0..372fe7dc 100644 --- a/libhdt/src/hdt/HDTManager.cpp +++ b/libhdt/src/hdt/HDTManager.cpp @@ -75,9 +75,9 @@ HDT *HDTManager::indexedHDT(HDT *hdt, ProgressListener *listener){ return bhdt; } -HDT *HDTManager::generateHDT(const char *rdfFileName, const char *baseURI, RDFNotation rdfNotation, HDTSpecification &hdtFormat, ProgressListener *listener){ +HDT *HDTManager::generateHDT(const char *rdfFileName, const char *baseURI, RDFNotation rdfNotation, HDTSpecification &hdtFormat, ProgressListener *listener, LoaderType loaderType){ BasicHDT *hdt = new BasicHDT(hdtFormat); - hdt->loadFromRDF(rdfFileName, baseURI, rdfNotation, listener); + hdt->loadFromRDF(rdfFileName, baseURI, rdfNotation, listener, loaderType); return hdt; } diff --git a/libhdt/tools/rdf2hdt.cpp b/libhdt/tools/rdf2hdt.cpp index 2f60776c..5669656f 100644 --- a/libhdt/tools/rdf2hdt.cpp +++ b/libhdt/tools/rdf2hdt.cpp @@ -52,6 +52,7 @@ void help() { cout << "\t-c\t\tHDT Config options file" << endl; cout << "\t-o\t\tHDT Additional options (option1=value1;option2=value2;...)" << endl; cout << "\t-f\t\tFormat of the RDF input (nquads,nq,ntriples,nt,trig,turtle,ttl)" << endl; + cout << "\t-l\t\tIndicate whether 1 or 2 passes are used to create the HDT (default 2)" << endl; cout << "\t-B\t\"\"\tBase URI of the dataset." << endl; cout << "\t-V\tPrints the HDT version number." << endl; cout << "\t-p\tPrints a progress indicator." << endl; @@ -68,6 +69,7 @@ int main(int argc, char **argv) { string options; string rdfFormat; string baseUri; + string lType; /** * Input file format. If no -f is specified and we can't guess which @@ -76,7 +78,7 @@ int main(int argc, char **argv) { RDFNotation notation = NTRIPLES; int flag; - while ((flag = getopt (argc, argv, "c:o:vpf:B:iVh")) != -1) + while ((flag = getopt (argc, argv, "c:o:vpfl:B:iVh")) != -1) { switch (flag) { @@ -95,6 +97,9 @@ int main(int argc, char **argv) { case 'f': rdfFormat = optarg; break; + case 'l': + lType = optarg; + break; case 'B': baseUri = optarg; break; @@ -204,6 +209,10 @@ int main(int argc, char **argv) { vout << "Detected RDF input format: " << rdfFormat << endl; + // Detect loader type + LoaderType loaderType = lType == "1" ? ONE_PASS : TWO_PASS; + vout << "Detected Loader type: " << (loaderType + 1) << "-pass" << endl; + // Process HDTSpecification spec(configFile); @@ -214,7 +223,7 @@ int main(int argc, char **argv) { StopWatch globalTimer; ProgressListener* progress = showProgress ? new StdoutProgressListener() : NULL; - HDT *hdt = HDTManager::generateHDT(inputFile.c_str(), baseUri.c_str(), notation, spec, progress); + HDT *hdt = HDTManager::generateHDT(inputFile.c_str(), baseUri.c_str(), notation, spec, progress, loaderType); ofstream out;