From e4c643880aaa4ddaed1e401e82d3403eb44716c8 Mon Sep 17 00:00:00 2001
From: Michael Mandl
Date: Sat, 23 Mar 2024 11:24:46 +0100
Subject: [PATCH] docs: describe BucketFinder

---
 lib_vector_search/include/bucket_finder.h | 17 ++++++++++++
 lib_vector_search/src/bucket_finder.cpp   | 32 +++++++++++++----------
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/lib_vector_search/include/bucket_finder.h b/lib_vector_search/include/bucket_finder.h
index 4b2dddf..b97ecbc 100644
--- a/lib_vector_search/include/bucket_finder.h
+++ b/lib_vector_search/include/bucket_finder.h
@@ -10,17 +10,34 @@ private:
   std::unordered_map directory_;
 
 public:
+  /// Inserts references to all words from word_list between first_index and
+  /// last_index, including first_index, excluding last_index.
   void insert(const WordList &word_list, size_t first_index,
               size_t last_index);
+  /// Find all words that start with search_term
+  /// @return A list with references to the results.
   WordRefList find_prefix(std::string_view search_term) const;
 };
 
+/** This class provides efficient, parallel search over a list of strings.
+ *
+ * References to all input strings are stored in a tree-like structure that
+ * provides fast and lock-free parallel insertion and parallel (with minimal
+ * synchronization) search for all words that start with a given term.
+ */
 class BucketFinder : public Finder {
 private:
   std::vector buckets_;
 
 public:
+  /// Creates a BucketFinder over all words in word_list.
   BucketFinder(const WordList &word_list);
 
+  /// Find all words that start with search_term
+  /// @return A list with references to the results.
   WordRefList find_prefix(std::string_view search_term) const override;
+
+private:
+  /// Inserts references to all words from word_list.
+  void insert(const WordList &word_list);
 };
diff --git a/lib_vector_search/src/bucket_finder.cpp b/lib_vector_search/src/bucket_finder.cpp
index 8fd7500..03949e1 100644
--- a/lib_vector_search/src/bucket_finder.cpp
+++ b/lib_vector_search/src/bucket_finder.cpp
@@ -1,8 +1,11 @@
 #include "bucket_finder.h"
 
+#include
+#include
 #include
-#include
+#include
 #include
+#include
 
 void Bucket::insert(const WordList &word_list, size_t first_index,
                     size_t last_index) {
@@ -28,34 +31,35 @@ WordRefList Bucket::find_prefix(std::string_view search_term) const {
   return result;
 }
 
-BucketFinder::BucketFinder(const WordList &word_list) {
+BucketFinder::BucketFinder(const WordList &word_list) { insert(word_list); }
+
+void BucketFinder::insert(const WordList &word_list) {
   if (word_list.empty()) {
     return;
   }
 
+  const size_t max_threads = std::thread::hardware_concurrency();
   const size_t word_list_size = word_list.size();
-  const size_t bucket_count =
-      std::min(std::thread::hardware_concurrency(), word_list_size);
+  const size_t bucket_count = std::min(max_threads, word_list_size);
   const size_t bucket_size = word_list_size / bucket_count;
 
   buckets_.resize(bucket_count);
 
-  std::vector threads;
+  std::vector insert_threads;
   for (auto bucket_index = 0; bucket_index < bucket_count; ++bucket_index) {
     auto &bucket = buckets_[bucket_index];
 
-    bool is_last_bucket = bucket_index == bucket_count - 1;
+    const bool is_last_bucket = bucket_index == bucket_count - 1;
+    const size_t first_word_index = bucket_index * bucket_size;
+    const size_t last_word_index =
+        is_last_bucket ? word_list_size : first_word_index + bucket_size;
 
-    const size_t first_index = bucket_index * bucket_size;
-    const size_t last_index =
-        is_last_bucket ? word_list_size : first_index + bucket_size;
-
-    threads.emplace_back([&, first_index, last_index] {
-      bucket.insert(word_list, first_index, last_index);
+    insert_threads.emplace_back([&, first_word_index, last_word_index] {
+      bucket.insert(word_list, first_word_index, last_word_index);
     });
   }
 
-  for (auto &thread : threads) {
+  for (auto &thread : insert_threads) {
     thread.join();
   }
 }
@@ -69,7 +73,7 @@ WordRefList BucketFinder::find_prefix(std::string_view search_term) const {
     threads.emplace_back([&] {
       auto thread_search_results = bucket.find_prefix(search_term);
       if (!thread_search_results.empty()) {
-        std::lock_guard result_lock(search_results_mutex);
+        const std::lock_guard result_lock(search_results_mutex);
         std::move(thread_search_results.begin(), thread_search_results.end(),
                   std::back_inserter(search_results));
       }