diff --git a/books.cpp b/books.cpp index 4c7cd3f..c6a30e0 100644 --- a/books.cpp +++ b/books.cpp @@ -1,6 +1,3 @@ -#define MIN_CITATION_LENGTH 10 -// FIXME: static arrays are too big here, find a dynamic solution - #include #include #include @@ -380,10 +377,11 @@ int lookupTranslation(string moduleName, string book, const string& verse) { return 0; } -int addBook_cached(string moduleName) { +/// Load all books of a Bible edition from the on-disk cache. +int addBooks_cached(string moduleName) { vector bookNames; // This is needed for correct alphabetical ordering: - if (isOTBook(moduleName)) { + if (isOTBook(moduleName)) { // These books should be loaded from the Old Testament... bookNames={"Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy", "Joshua", "Judges", "Ruth", "I_Samuel", "II_Samuel", "I_Kings", "II_Kings", "I_Chronicles", "II_Chronicles", @@ -392,7 +390,7 @@ int addBook_cached(string moduleName) { "Hosea", "Joel", "Amos", "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk", "Zephaniah", "Haggai", "Zechariah", "Malachi"}; } - if (isNTBook(moduleName)) { + if (isNTBook(moduleName)) { // These books should be loaded from the New Testament... 
bookNames={"Matthew", "Mark", "Luke", "John", "Acts", "Romans", "I_Corinthians", "II_Corinthians", "Galatians", "Ephesians", "Philippians", "Colossians", @@ -405,25 +403,25 @@ int addBook_cached(string moduleName) { "Revelation_of_John"}; } - string path = "bibref-addbooks-cache/" + moduleName; + string path = "bibref-addbooks-cache/" + moduleName; // hardcoded path - PsalmsInfo pi = PsalmsInfo(moduleName); + PsalmsInfo pi = PsalmsInfo(moduleName); // create database for Psalms for (int i=1; i<=151; i++) pi.setLastVerse(i, 0); // initialize for (vector::iterator i=bookNames.begin(); i!=bookNames.end(); ++i) { string bookName = *i; - std::ifstream bookFile(path + "/" + bookName + ".book"); + std::ifstream bookFile(path + "/" + bookName + ".book"); // open a-y encoded raw book std::stringstream buffer; buffer << bookFile.rdbuf(); Book book = Book(bookName); book.setText(string(buffer.str())); - std::ifstream tokensFile(path + "/" + bookName + ".tokens"); + std::ifstream tokensFile(path + "/" + bookName + ".tokens"); // open tokens for the given book vector tokens; int t; while (tokensFile >> t) { - tokens.push_back(t); + tokens.push_back(t); // read tokens } book.setTokens(tokens); @@ -445,25 +443,25 @@ int addBook_cached(string moduleName) { error("Data loading error from cache file " + verseFileName + "."); error("Cache may be corrupt or incompatible with this version. Consider removing it, then retry."); error("To avoid data corruption, bibref exits now. Sorry for any inconveniences."); - exit(1); + exit(1); // Inform the user and exit, since this is a fatal error. 
} fclose(verseFile); - add_vocabulary_item(bookName); - book.setModuleName(moduleName); + add_vocabulary_item(bookName); // add readline entry for this book + book.setModuleName(moduleName); // put this book in the given Bible edition books.push_back(book); } - psalmsInfos.push_back(pi); + psalmsInfos.push_back(pi); // store Psalm database info("Done loading books of " + moduleName + " (cached)."); - return 0; + return 0; // Success! } /// Load a Bible edition, and save it on the disk for a future cache (if there is no cache saved yet). -int addBook(string moduleName, string firstVerse, string lastVerse, bool removeAccents) { +int addBooks(string moduleName, string firstVerse, string lastVerse, bool removeAccents) { DIR* cache_dir = opendir(("bibref-addbooks-cache/" + moduleName).c_str()); // This is hardcoded. if (cache_dir) { // If there is a cache saved on the disk, we'll use it. closedir(cache_dir); - addBook_cached(moduleName); // Load the book from the cache. + addBooks_cached(moduleName); // Load the book from the cache. return 0; // No further operation is required. } SWMgr library(new MarkupFilterMgr(FMT_PLAIN)); @@ -621,15 +619,15 @@ int addBook(string moduleName, string firstVerse, string lastVerse, bool removeA return 0; // Success! } -int addBooks() { +int addBibles() { int success = 0; - if (addBook("LXX", "Genesis 1:1", "Malachi 4:6", false) !=0 ) { + if (addBooks("LXX", "Genesis 1:1", "Malachi 4:6", false) !=0 ) { success = 1; } - if (addBook("SBLGNT", "Matthew 1:1", "Revelation of John 22:21", true) != 0) { + if (addBooks("SBLGNT", "Matthew 1:1", "Revelation of John 22:21", true) != 0) { success = 1; } - if (addBook("StatResGNT", "Matthew 1:1", "Revelation of John 22:21", true) != 0) { + if (addBooks("StatResGNT", "Matthew 1:1", "Revelation of John 22:21", true) != 0) { success = 1; } return success; @@ -723,6 +721,10 @@ int compare(string verse1, string verse2) { return d; } + +/* The following part is experimental code and currently unmaintained. 
*/ + +/* typedef struct FingerprintInfo { Fingerprint m_fp; Book *m_book; @@ -750,6 +752,9 @@ typedef struct Int2 { int m_coords[2]; } Int2; +#define MIN_CITATION_LENGTH 10 +// FIXME: static arrays are too big here, find a dynamic solution + int findBestFit(const string& book1, const string& info1, const string& verseInfo1s, const string& verseInfo1e, const string& book2, const string& info2, const string& verseInfo2s, const string& verseInfo2e) { info("Comparing " + book1 + " (" + info1 + ") " + verseInfo1s + "-" + verseInfo1e + " and " @@ -852,40 +857,47 @@ int findBestFit(const string& book1, const string& info1, const string& verseInf return 0; } +*/ + +/* End of experimental code. */ + +/// Low level algorithm to find exact string match in the a-y encoded books. +/// @return "f,b,p" where f is the number of occurrences, b is the book name of the last occurrence and p is its position. string _find(string text, string moduleName, int maxFound, bool verbose) { - int found = 0; - size_t pos; - string book; - for (int i=0; i info; @@ -893,6 +905,8 @@ int find(const string& text, const string& moduleName, int maxFound, bool verbos return stoi(info[0]); } +/// Return the book name and position of the last occurrence of an a-y encoded text in Bible edition moduleName. +/// @return "b,p" where b is the book name of the last occurrence and p is its position. 
string find(const string& text, const string& moduleName) { string f = _find(text, moduleName, 1, 0); vector info; @@ -901,11 +915,11 @@ string find(const string& text, const string& moduleName) { } vector find_min_unique(string text, const string& moduleName, bool verbose) { - int long_limit = 10000; - int extreme_limit = 50000; + int long_limit = 10000; // below this the algorithm should work fast enough + int extreme_limit = 50000; // above this there is no hope to get a result because of out of memory error vector retval; int l = text.length(); - if (l > extreme_limit) { + if (l > extreme_limit) { // warn the user, but don't stop error("Input is extremely long (" + to_string(l) + " characters), expect out of memory error."); } else if (l > long_limit) { error("Input is very long (" + to_string(l) + " characters), expect very slow operation."); @@ -924,61 +938,62 @@ vector find_min_unique(string text, const string& moduleName, bool verbo for (int i = 0; i < l; ++i) { for (int j = 0; j < l - i; ++j) { if (i > 0 && (is_unique[i - 1][j + 1] > 0 || is_unique[i - 1][j] > 0)) { - is_unique[i][j] = 2; + is_unique[i][j] = 2; // this entry can already be ignored } else { string subtext = text.substr(j, i+1); - int unique = find(subtext, moduleName, 2, 0); + int unique = find(subtext, moduleName, 2, 0); // decide if this subtext is unique if (unique == 1) { - is_unique[i][j] = 1; + is_unique[i][j] = 1; // if yes, fill in the database if (verbose) { - info("Text " + subtext + " is minimal unique."); + info("Text " + subtext + " is minimal unique."); // inform the user if needed } - retval.push_back(subtext); + retval.push_back(subtext); // store this result } else { - is_unique[i][j] = 0; + is_unique[i][j] = 0; // if no, fill in the database } } } } - return retval; + return retval; // return the list of minimally unique subtexts } +/// Low level algorithm to extend a passage that is unique in another edition to be maximally long, and keep verbatim equality. 
string _extend(const string& moduleName1, const string& moduleName2, const string& book2, int pos2S, int pos2E, bool verbose) { - Book b2 = getBook(book2, moduleName2); - string text = b2.getText().substr(pos2S, pos2E - pos2S + 1); + Book b2 = getBook(book2, moduleName2); // the book where the passage will be extended + string text = b2.getText().substr(pos2S, pos2E - pos2S + 1); // the passage to extend // checking the input if (find(text, moduleName1, 2, 0) != 1) { - throw NoCitationException; + throw NoCitationException; // this passage is not present in the assumed Bible edition, that's an error } bool citation = true; string found = find(text, moduleName1); vector info1; boost::split(info1, found, boost::is_any_of(",")); - string book1 = info1[0]; + string book1 = info1[0]; // last occurrence of passage in the other edition is in book1 Book b1 = getBook(book1, moduleName1); - int pos1S = stoi(info1[1]); + int pos1S = stoi(info1[1]); // last position of passage in the other edition is pos1S - string text1 = b1.getText(); - string text2 = b2.getText(); + string text1 = b1.getText(); // the whole Bible text of the other edition + string text2 = b2.getText(); // the whole Bible text of the edition of the passage - while (citation && pos1S > 0 && pos2S > 0) { + while (citation && pos1S > 0 && pos2S > 0) { // shift left in both Bible editions pos1S--; pos2S--; - citation = text1.at(pos1S) == text2.at(pos2S); + citation = text1.at(pos1S) == text2.at(pos2S); // until there is no plausible verbatim citation anymore } - pos1S++; - pos2S++; + pos1S++; // fix start pointer for the other edition + pos2S++; // fix start pointer for the edition of the input passage citation = true; int pos1E = pos1S + pos2E - pos2S; - while (citation && pos1E < text1.length() - 1 && pos2E < text2.length() - 1) { + while (citation && pos1E < text1.length() - 1 && pos2E < text2.length() - 1) { // shift right in both editions pos1E++; pos2E++; citation = text1.at(pos1E) == text2.at(pos2E); } - 
pos1E--; - pos2E--; + pos1E--; // fix end pointer for the other edition + pos2E--; // fix end pointer for the edition of the input passage string verse1infoS = b1.getVerseInfoStart(pos1S); string verse1infoE = b1.getVerseInfoEnd(pos1E); string verse2infoS = b2.getVerseInfoStart(pos2S); @@ -988,12 +1003,12 @@ string _extend(const string& moduleName1, const string& moduleName2, const strin + verse1infoS + " " + verse1infoE + reset_color + " = " + nt_color + moduleName2 + " " + book2 + " " + verse2infoS + " " + verse2infoE + reset_color + " (" + ot_color + text1.substr(pos1S, pos1E - pos1S + 1) + reset_color + ", length " - + to_string(pos1E - pos1S + 1) + ")."); + + to_string(pos1E - pos1S + 1) + ")."); // report info verbosely } return ot_color + moduleName1 + " " + book1 + " " + verse1infoS + " " + verse1infoE + reset_color + " = " + nt_color + moduleName2 + " " + book2 + " " + verse2infoS + " " + verse2infoE + reset_color + "," - + to_string(pos1S) + "," + to_string(pos1E - pos1S + 1) + "," + to_string(pos2S); + + to_string(pos1S) + "," + to_string(pos1E - pos1S + 1) + "," + to_string(pos2S); // return concise report } void extend(const string& moduleName1, const string& moduleName2, const string& book2, const string& verse2S, @@ -1048,37 +1063,37 @@ bool equalReference(Reference r1, Reference r2) { void getrefs(const string& moduleName2, const string& moduleName1, const string& book1, const string& verse1S, int start, const string& verse1E, int end) { - vector refs; - Book b1 = getBook(book1, moduleName1); - int pos1S = b1.getVerseStart(verse1S) + start; - int pos1E = b1.getVerseEnd(verse1E) - end; - string text = b1.getText().substr(pos1S, pos1E - pos1S + 1); - vector minunique = find_min_unique(text, moduleName1, 0); - for (string m : minunique) { - vector found = find_all(m, moduleName2, maxresults); - for (string f : found) { + vector refs; // store plausible references in a vector + Book b1 = getBook(book1, moduleName1); // the input passage is in book b1 + 
int pos1S = b1.getVerseStart(verse1S) + start; // exact start position of the input passage + int pos1E = b1.getVerseEnd(verse1E) - end; // exact end position of the input passage + string text = b1.getText().substr(pos1S, pos1E - pos1S + 1); // the input passage + vector minunique = find_min_unique(text, moduleName1, 0); // get all minimally unique subtexts of the input + for (string m : minunique) { // For each subtext... + vector found = find_all(m, moduleName2, maxresults); // find all plausible verbatim quotations in the other edition + for (string f : found) { // For each candidate... vector info1, info2; boost::split(info1, f, boost::is_any_of(",")); - string book2 = info1[0]; - int pos = stoi(info1[1]); - string ext = _extend(moduleName1, moduleName2, book2, pos, pos + m.length() - 1, 0); + string book2 = info1[0]; // book2 contains the book of the other edition + int pos = stoi(info1[1]); // pos stores the position of plausible verbatim quotation in the other edition + string ext = _extend(moduleName1, moduleName2, book2, pos, pos + m.length() - 1, 0); // Extend the plausible quotation... boost::split(info2, ext, boost::is_any_of(",")); Reference r = {stoi(info2[1]), stoi(info2[3]), stoi(info2[2]), info2[0]}; - refs.push_back(r); + refs.push_back(r); // Store the result. } } - sort(refs.begin(), refs.end(), compareReference); + sort(refs.begin(), refs.end(), compareReference); // Sort the results by text length (of the extension). vector::iterator it; - it = unique(refs.begin(), refs.end(), equalReference); - refs.resize(distance(refs.begin(), it)); - for (Reference r : refs) { + it = unique(refs.begin(), refs.end(), equalReference); // Delete duplicates. + refs.resize(distance(refs.begin(), it)); // Free memory. + for (Reference r : refs) { // Show all references. 
info(r.m_text + " (length=" + to_string(r.m_length) + ", pos1=" + ot_color + to_string(r.m_pos1 + 1) + reset_color + ", pos2=" + nt_color + to_string(r.m_pos2 + 1) + reset_color + ")"); - if (sql) { + if (sql) { // In sql mode output the required SQL statement skeleton as well. info("insert into quotations (nt_quotation_id, ot_id, nt_id, ot_book, psalm, ot_passage, nt_book, nt_passage, ot_startpos, ot_length, nt_startpos, nt_length, found_method) values"); string output = " (?, ?, ?, '" + ot_color + book1 + reset_color + "', "; output += ot_color; - if (book1.compare("Psalms") == 0) { + if (book1.compare("Psalms") == 0) { // handle Psalms in a special way (because of the SQL database) vector verse1_split; boost::split(verse1_split, verse1S, boost::is_any_of(":")); output += verse1_split[0]; @@ -1104,7 +1119,7 @@ void getrefs(const string& moduleName2, const string& moduleName1, const string& output += nt_color + to_string(r.m_length) + reset_color + ", "; output += "'getrefs');"; - info(output); + info(output); // End of sql mode. } } } diff --git a/books.dox b/books.dox index 420c350..1fe4950 100644 --- a/books.dox +++ b/books.dox @@ -4,9 +4,9 @@ * They contain several books like Genesis and Exodus, or Matthew and Mark (and others). */ -/// -/// Automatically add all traditional books to a Bible edition. -int addBooks(); +/// Load and index the default Bible editions wrt. their a-y encoded texts. +/// Currently LXX, SBLGNT and StatResGNT are loaded. +int addBibles(); /// /// Lookup a verse in a Bible edition, in a given book, a given passage. string lookupVerse(const string& book, const string& info, const string& verse) { @@ -48,9 +48,13 @@ int compare(string verse1, string verse2); /// Inputs are given in a-y encoding. int compareLatin(string verse1, string verse2); /// + +/* /// @deprecated Assume that two passages are in correlation, find their best fit. 
int findBestFit(const string& book1, const string& info1, const string& verseInfo1s, const string& verseInfo1e, const string& book2, const string& info2, const string& verseInfo2s, const string& verseInfo2e); + */ + /// /// Convert raw input verse to a-y encoding. string processVerse(const string &verse); @@ -66,13 +70,13 @@ string getText(const string& book, const string& info, const string& VerseInfoS, /// @return The requested text. }; /// -/// Find a passage (verbatim) in a Bible edition. +/// Find a passage (verbatim) in a Bible edition and return the number of occurrences. int find(const string& text, const string& moduleName, int maxFound, int verb) { /// @param text The passage to be found in a-y encoding. /// @param moduleName The Bible edition to search, e.g. "LXX". /// @param maxFound The maximal amount of occurrences, then stop. /// @param verb Request verbose mode (1 if yes). - /// @return if the process was successul. + /// @return the number of occurrences. }; /// /// Find a minimally unique subtext in the input text. 
diff --git a/books.h b/books.h index 2b4f7f8..85f2db6 100644 --- a/books.h +++ b/books.h @@ -3,7 +3,7 @@ #include "fingerprint.h" -int addBooks(); +int addBibles(); string lookupVerse(const string& book, const string& info, const string& verse); Fingerprint getTextFingerprint(const string& book, const string& info, int start, int length); Fingerprint getTextFingerprint(const string& book, const string& info, const string& start, const string& end); @@ -12,8 +12,10 @@ int compare(const string& book1, const string& info1, const string& verseInfo1s, const string& book2, const string& info2, const string& verseInfo2s, const string& verseInfo2e, int startOffset2, int endOffset2); int compare(string verse1, string verse2); int compareLatin(string verse1, string verse2); +/* int findBestFit(const string& book1, const string& info1, const string& verseInfo1s, const string& verseInfo1e, const string& book2, const string& info2, const string& verseInfo2s, const string& verseInfo2e); +*/ string processVerse(const string &verse); string getText(const string& book, const string& info, const string& VerseInfoS, const string& VerseInfoE, int startOffset, int endOffset); int find(const string& text, const string& moduleName, int maxFound, bool verbose); diff --git a/cli.cpp b/cli.cpp index 4748fa9..b8a0542 100644 --- a/cli.cpp +++ b/cli.cpp @@ -182,7 +182,7 @@ void processAddbooksCmd() { if (booksAdded) { error("Books already added."); } else { - if (addBooks() == 0) { + if (addBibles() == 0) { booksAdded = true; } } @@ -823,7 +823,7 @@ void cli(const char *input_prepend, const char *output_prepend, bool addbooks, b info("This is bibref " BIBREF_VERSION ", nice to meet you."); showAvailableBibles(); if (addbooks) { - if (addBooks() == 0) { + if (addBibles() == 0) { booksAdded = true; } }