BM25 part 3 | Naive Benchmark

blogging
embedding
qdrant
BM25 benchmarked with bm25s, rank_bm25, NumPy vectorization, SciPy sparse matrices, Polars, and plain Python code.
Author

kareem

Published

December 20, 2025

import marimo as mo
from collections import Counter
from math import log
from scipy.sparse import csr_matrix
import numpy as np


def tokenize(text):
    return text.lower().split()

Dataset Benchmark

I will continue exploring BM25, this time using it with a Hugging Face dataset, implementing it in multiple ways and comparing their speed.

The methods I will use:

  1. Normal Python calculations
  2. SciPy sparse matrix with Python loops
  3. SciPy sparse matrix with NumPy vectorization
  4. rank_bm25
  5. bm25s
  6. Polars optimization

If there is any mistake, please tell me!

It's my first time creating a benchmark for anything.
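
For reference, every implementation below computes the same BM25 variant that the code uses, with $k_1 = 1.5$ and $b = 0.75$:

$$
\text{score}(q, d) = \sum_{t \in q} \ln\!\left(\frac{N - \text{df}(t) + 0.5}{\text{df}(t) + 0.5}\right) \cdot \frac{\text{tf}(t, d)\,(k_1 + 1)}{\text{tf}(t, d) + k_1\left(1 - b + b \cdot \frac{|d|}{\text{avgdl}}\right)}
$$

Here $N$ is the number of documents, $\text{df}(t)$ is the document frequency of term $t$, $\text{tf}(t, d)$ is its frequency in document $d$, $|d|$ is the document length, and $\text{avgdl}$ is the average document length. Note that this IDF goes negative for terms appearing in more than half of the documents, which is why some scores later in the post are negative.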

from datasets import load_dataset
import time
import random

# Load dataset
dataset = load_dataset("rotten_tomatoes", split="train")
corpus = [item["text"] for item in dataset]
print(f"Corpus size: {len(corpus)} reviews")


def generate_random_queries(corpus, num_queries=1000):
    queries = []
    for _ in range(num_queries):
        doc = random.choice(corpus)
        words = tokenize(doc)
        num_words = random.randint(2, 4)
        if len(words) >= num_words:
            query_words = random.sample(words, num_words)
        else:
            query_words = words
        queries.append(" ".join(query_words))
    return queries


test_queries = generate_random_queries(corpus, 1000)
print(f"Generated {len(test_queries)} test queries")
Corpus size: 8530 reviews
Generated 1000 test queries
def benchmark_python_loops():
    print("=== 1. NORMAL PYTHON (LOOPS) ===")
    start = time.time()

    # Tokenize all documents
    all_docs_terms_py = []
    for doc in corpus:
        tokens = tokenize(doc)
        all_docs_terms_py.append(Counter(tokens))

    # Compute DF
    df_py = {}
    for doc_tf in all_docs_terms_py:
        for term in doc_tf.keys():
            df_py[term] = df_py.get(term, 0) + 1

    # Average doc length
    total_len = sum(sum(doc_tf.values()) for doc_tf in all_docs_terms_py)
    avg_doc_len_py = total_len / len(corpus)

    # IDF function
    def compute_idf_py(term, df_dict, N):
        if term not in df_dict:
            return 0
        return np.log((N - df_dict[term] + 0.5) / (df_dict[term] + 0.5))

    # BM25 function
    def bm25_score_py(term, doc_tf, idf_val, doc_len, avg_len, k1=1.5, b=0.75):
        tf = doc_tf.get(term, 0)
        if tf == 0:
            return 0
        return idf_val * tf * (k1 + 1) / (tf + k1 * (1 - b + b * (doc_len / avg_len)))

    index_time = time.time() - start

    # Query
    start_query = time.time()
    for query in test_queries:
        query_tokens = tokenize(query)
        doc_scores = [0] * len(corpus)
        for token in query_tokens:
            if token in df_py:
                idf_val = compute_idf_py(token, df_py, len(corpus))
                for doc_idx, doc_tf in enumerate(all_docs_terms_py):
                    doc_len = sum(doc_tf.values())
                    doc_scores[doc_idx] += bm25_score_py(
                        token, doc_tf, idf_val, doc_len, avg_doc_len_py
                    )
        top_k = sorted(
            range(len(doc_scores)), key=lambda i: doc_scores[i], reverse=True
        )[:10]

    query_time = time.time() - start_query
    qps = len(test_queries) / query_time

    print(f"Indexing: {index_time:.4f}s")
    print(f"Query time: {query_time:.4f}s")
    print(f"QPS: {qps:.2f}\n")

    return index_time, query_time, qps


benchmark_python_loops()
=== 1. NORMAL PYTHON (LOOPS) ===
Indexing: 0.0801s
Query time: 23.6460s
QPS: 42.29
(0.08005261421203613, 23.645965099334717, 42.29051323551751)

Why Sparse Matrices?

Right now, to search for “python programming”, you’d need to:

  1. Loop through each document
  2. Calculate the BM25 score for “python”
  3. Calculate the BM25 score for “programming”
  4. Add them up

That’s slow for large datasets!

The key insight: We can pre-compute ALL BM25 scores for ALL words in ALL documents and store them in a matrix. Then searching becomes just looking up rows and adding them.

Here’s what the matrix looks like conceptually:

               Doc0   Doc1   Doc2
python         -0.51  -0.56   0.00
programming    -1.95  -2.14  -1.79
java            0.00   0.00   2.20
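
Here is a minimal sketch of that lookup, using the toy scores above (the words, values, and row order are just illustrative):

from scipy.sparse import csr_matrix
import numpy as np

# Toy precomputed BM25 matrix: rows = words, columns = documents
word_to_idx = {"python": 0, "programming": 1, "java": 2}
bm25_matrix = csr_matrix(np.array([
    [-0.51, -0.56,  0.00],   # python
    [-1.95, -2.14, -1.79],   # programming
    [ 0.00,  0.00,  2.20],   # java
]))

# Searching for "python programming" = pick the two matching rows and add them up
query_rows = [word_to_idx[w] for w in tokenize("python programming")]
doc_scores = np.asarray(bm25_matrix[query_rows, :].sum(axis=0)).ravel()
print(doc_scores)  # one score per document: [-2.46 -2.7  -1.79]

The expensive part (computing the scores) happens once at indexing time; each query is just a row slice and a sum, which is exactly what implementations 2, 3, and 6 below do.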
def benchmark_sparse_python_loops():
    print("=== 2. SPARSE MATRIX WITH PYTHON LOOPS ===")
    start = time.time()

    # Tokenize
    all_docs_terms_sp = []
    for doc in corpus:
        tokens = tokenize(doc)
        all_docs_terms_sp.append(Counter(tokens))

    # Get all words and create word_to_idx
    all_words_sp = list(set(word for doc in all_docs_terms_sp for word in doc))
    word_to_idx_sp = {word: idx for idx, word in enumerate(all_words_sp)}

    # Compute DF
    df_sp = {}
    for doc_tf in all_docs_terms_sp:
        for term in doc_tf.keys():
            df_sp[term] = df_sp.get(term, 0) + 1

    # Average doc length
    avg_doc_len_sp = sum(sum(doc_tf.values()) for doc_tf in all_docs_terms_sp) / len(
        corpus
    )

    # Build sparse matrix
    rows_idx = []
    cols_idx = []
    data = []

    for word_idx, word in enumerate(all_words_sp):
        idf_val = np.log((len(corpus) - df_sp[word] + 0.5) / (df_sp[word] + 0.5))
        for doc_idx, doc_tf in enumerate(all_docs_terms_sp):
            doc_len = sum(doc_tf.values())
            tf = doc_tf.get(word, 0)
            if tf != 0:
                score = (
                    idf_val
                    * tf
                    * 2.5
                    / (tf + 1.5 * (1 - 0.75 + 0.75 * (doc_len / avg_doc_len_sp)))
                )
                rows_idx.append(word_idx)
                cols_idx.append(doc_idx)
                data.append(score)

    sparse_matrix_sp = csr_matrix(
        (data, (rows_idx, cols_idx)), shape=(len(all_words_sp), len(corpus))
    )

    index_time = time.time() - start

    # Query
    start_query = time.time()
    for query in test_queries:
        query_indices = [
            word_to_idx_sp[word] for word in tokenize(query) if word in word_to_idx_sp
        ]
        if query_indices:
            doc_scores = np.array(
                sparse_matrix_sp[query_indices, :].sum(axis=0)
            ).flatten()
            top_k = np.argsort(doc_scores)[-10:][::-1]

    query_time = time.time() - start_query
    qps = len(test_queries) / query_time

    print(f"Indexing: {index_time:.4f}s")
    print(f"Query time: {query_time:.4f}s")
    print(f"QPS: {qps:.2f}\n")

    return index_time, query_time, qps


benchmark_sparse_python_loops()
=== 2. SPARSE MATRIX WITH PYTHON LOOPS ===
Indexing: 77.9146s
Query time: 0.4709s
QPS: 2123.64
(77.91458225250244, 0.4708900451660156, 2123.6380133019015)

Performance Optimization with NumPy

My code has Python loops everywhere, and Python loops are very slow. The goal here is to use NumPy vectorization: doing operations on entire arrays at once instead of looping:

for word_1 in all_words:
    for doc_idx, doc_1 in enumerate(all_docs_terms):
        idf_python_1 = compute_idf(word_1, 3, df)
        doc_len_1 = sum(all_docs_terms[doc_idx].values())
        score_1 = bm25_score(word_1, doc_1, idf_python_1, doc_len_1, avg_doc_len)

This has nested loops => very slow for large datasets. Key insight: instead of computing one BM25 score at a time, we can compute all scores at once using matrix operations.
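
As a tiny illustration of what "all at once" means, here is a sketch of the vectorized computation on a made-up 3-word × 2-document term-frequency matrix (the numbers are arbitrary; the real function below does the same thing on the full corpus):

import numpy as np

k1, b = 1.5, 0.75

# Toy term-frequency matrix: rows = words, columns = documents
tf = np.array([[2, 0],
               [1, 3],
               [0, 1]])

doc_lengths = tf.sum(axis=0)                   # length of each document
avg_doc_len = doc_lengths.mean()
df = (tf > 0).sum(axis=1)                      # document frequency per word
N = tf.shape[1]
idf = np.log((N - df + 0.5) / (df + 0.5))      # one IDF value per word

# Broadcasting scores every (word, document) cell in one expression, no loops
length_norm = 1 - b + b * (doc_lengths / avg_doc_len)
bm25 = idf[:, None] * tf * (k1 + 1) / (tf + k1 * length_norm)

print(bm25.shape)  # (3, 2): one BM25 score per (word, document) pair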

def benchmark_sparse_numpy():
    print("=== 3. SPARSE MATRIX WITH NUMPY VECTORIZATION ===")
    start = time.time()

    # Tokenize
    all_docs_terms_np = []
    for doc in corpus:
        tokens = tokenize(doc)
        all_docs_terms_np.append(Counter(tokens))

    all_words_np = list(set(word for doc in all_docs_terms_np for word in doc))
    word_to_idx_np = {word: idx for idx, word in enumerate(all_words_np)}

    # Build TF matrix
    rows = []
    cols = []
    tf_data = []

    for doc_idx, doc_tf in enumerate(all_docs_terms_np):
        for word, tf in doc_tf.items():
            rows.append(word_to_idx_np[word])
            cols.append(doc_idx)
            tf_data.append(tf)

    tf_matrix_np = csr_matrix(
        (tf_data, (rows, cols)), shape=(len(all_words_np), len(corpus))
    )

    # Compute DF array
    df_array_np = np.array((tf_matrix_np > 0).sum(axis=1)).flatten()

    # Document lengths
    doc_lengths_np = np.array(tf_matrix_np.sum(axis=0)).flatten()
    avg_doc_len_np = doc_lengths_np.mean()

    # Compute IDF array
    N = len(corpus)
    idf_array_np = np.log((N - df_array_np + 0.5) / (df_array_np + 0.5))

    # Vectorized BM25
    k1 = 1.5
    b = 0.75
    tf_dense = tf_matrix_np.toarray()

    numerator = tf_dense * (k1 + 1)
    length_norm = 1 - b + b * (doc_lengths_np / avg_doc_len_np)
    denominator = tf_dense + k1 * length_norm

    idf_column = idf_array_np.reshape(-1, 1)
    bm25_scores = idf_column * (numerator / (denominator + 1e-10))

    bm25_matrix_np = csr_matrix(bm25_scores)

    index_time = time.time() - start

    # Query
    start_query = time.time()
    for query in test_queries:
        query_indices = [
            word_to_idx_np[word] for word in tokenize(query) if word in word_to_idx_np
        ]
        if query_indices:
            doc_scores = np.array(
                bm25_matrix_np[query_indices, :].sum(axis=0)
            ).flatten()
            top_k = np.argsort(doc_scores)[-10:][::-1]

    query_time = time.time() - start_query
    qps = len(test_queries) / query_time

    print(f"Indexing: {index_time:.4f}s")
    print(f"Query time: {query_time:.4f}s")
    print(f"QPS: {qps:.2f}\n")

    return index_time, query_time, qps


benchmark_sparse_numpy()
=== 3. SPARSE MATRIX WITH NUMPY VECTORIZATION ===
Indexing: 6.9885s
Query time: 0.5250s
QPS: 1904.79
(6.988537311553955, 0.5249927043914795, 1904.7883744577036)
def benchmark_rank_bm25():
    from rank_bm25 import BM25Okapi

    print("=== 4. RANK-BM25 ===")

    tokenized_corpus_rank = [tokenize(doc) for doc in corpus]

    start = time.time()
    bm25_rank = BM25Okapi(tokenized_corpus_rank)
    index_time = time.time() - start

    start_query = time.time()
    for query in test_queries:
        query_tokens = tokenize(query)
        scores = bm25_rank.get_scores(query_tokens)
        top_k = np.argsort(scores)[-10:][::-1]

    query_time = time.time() - start_query
    qps = len(test_queries) / query_time

    print(f"Indexing: {index_time:.4f}s")
    print(f"Query time: {query_time:.4f}s")
    print(f"QPS: {qps:.2f}\n")

    return index_time, query_time, qps


benchmark_rank_bm25()
=== 4. RANK-BM25 ===
Indexing: 0.0813s
Query time: 5.0695s
QPS: 197.26
(0.08132147789001465, 5.069519281387329, 197.2573619892297)
def benchmark_bm25s():
    import bm25s

    print("=== 5. BM25S ===")

    start = time.time()
    retriever_bm25s = bm25s.BM25()
    corpus_tokens_bm25s = bm25s.tokenize(corpus, show_progress=False)
    retriever_bm25s.index(corpus_tokens_bm25s, show_progress=False)
    index_time = time.time() - start

    start_query = time.time()
    for query in test_queries:
        query_tokens_bm25s = bm25s.tokenize(query, show_progress=False)
        results, scores = retriever_bm25s.retrieve(
            query_tokens_bm25s, k=10, show_progress=False
        )

    query_time = time.time() - start_query
    qps = len(test_queries) / query_time

    print(f"Indexing: {index_time:.4f}s")
    print(f"Query time: {query_time:.4f}s")
    print(f"QPS: {qps:.2f}\n")

    return index_time, query_time, qps


benchmark_bm25s()
=== 5. BM25S ===
Indexing: 0.5329s
Query time: 0.7049s
QPS: 1418.74
(0.5328564643859863, 0.7048521041870117, 1418.7373408687158)
def benchmark_polars():
    import polars as pl

    print("=== 6. POLARS OPTIMIZATION ===")
    start = time.time()

    k1 = 1.5
    b = 0.75
    N = len(corpus)

    # Create LazyFrame
    lf = pl.LazyFrame({"doc_id": range(N), "text": corpus})

    # Tokenize and explode
    lf_tokens = (
        lf.with_columns(
            pl.col("text").str.to_lowercase().str.split(" ").alias("tokens")
        )
        .explode("tokens")
        .filter(pl.col("tokens").str.len_chars() > 0)
    )

    # Build vocab
    vocab_df = (
        lf_tokens.select(pl.col("tokens").unique()).collect().with_row_index("word_idx")
    )
    vocab_size = len(vocab_df)

    # Join to map tokens to indices
    lf_tokens = lf_tokens.join(vocab_df.lazy(), on="tokens", how="left")

    # Compute TF
    lf_tf = lf_tokens.group_by(["doc_id", "word_idx"]).agg(pl.len().alias("tf"))

    # Compute document lengths
    lf_doc_lens = lf_tf.group_by("doc_id").agg(pl.col("tf").sum().alias("doc_len"))
    avg_doc_len_pl = lf_doc_lens.select(pl.col("doc_len").mean()).collect().item()

    # Compute DF and IDF
    lf_idf = (
        lf_tf.group_by("word_idx")
        .agg(pl.col("doc_id").n_unique().alias("df"))
        .with_columns(
            ((pl.lit(N) - pl.col("df") + 0.5) / (pl.col("df") + 0.5)).log().alias("idf")
        )
    )

    # Calculate BM25
    df_bm25_pl = (
        lf_tf.join(lf_doc_lens, on="doc_id", how="left")
        .join(lf_idf, on="word_idx", how="left")
        .with_columns(
            (
                pl.col("idf")
                * pl.col("tf")
                * (k1 + 1)
                / (
                    pl.col("tf")
                    + k1 * (1 - b + b * (pl.col("doc_len") / avg_doc_len_pl))
                )
            ).alias("bm25_score")
        )
        .select(["doc_id", "word_idx", "bm25_score"])
        .collect()
    )

    # Build sparse matrix
    rows_pl = df_bm25_pl["word_idx"].to_numpy()
    cols_pl = df_bm25_pl["doc_id"].to_numpy()
    data_pl = df_bm25_pl["bm25_score"].to_numpy()
    word_to_idx_pl = dict(zip(vocab_df["tokens"], vocab_df["word_idx"]))
    bm25_matrix_pl = csr_matrix((data_pl, (rows_pl, cols_pl)), shape=(vocab_size, N))

    index_time = time.time() - start

    # Query
    start_query = time.time()
    for query in test_queries:
        query_indices = [
            word_to_idx_pl[word] for word in tokenize(query) if word in word_to_idx_pl
        ]
        if query_indices:
            doc_scores = np.array(
                bm25_matrix_pl[query_indices, :].sum(axis=0)
            ).flatten()
            top_k = np.argsort(doc_scores)[-10:][::-1]

    query_time = time.time() - start_query
    qps = len(test_queries) / query_time

    print(f"Indexing: {index_time:.4f}s")
    print(f"Query time: {query_time:.4f}s")
    print(f"QPS: {qps:.2f}\n")

    return index_time, query_time, qps


benchmark_polars()
=== 6. POLARS OPTIMIZATION ===
Indexing: 0.1580s
Query time: 0.6143s
QPS: 1627.94
(0.1579906940460205, 0.6142714023590088, 1627.9449053946898)

🏆 BM25 Benchmark Results (8,530 documents, 1000 queries)

Implementation              Indexing (s)   Query (s)   QPS
1. Python Loops             0.0801         23.6460     42.29
2. Sparse + Python Loops    77.9146        0.4709      2,123.64
3. Sparse + NumPy           6.9885         0.5250      1,904.79
4. rank-bm25                0.0813         5.0695      197.26
5. bm25s                    0.5329         0.7049      1,418.74
6. Polars                   0.1580         0.6143      1,627.94

Key Findings:

Category          Winner                  Value
Fastest Indexing  Python Loops            0.0801s
Fastest Query     Sparse + Python Loops   0.4709s
Highest QPS       Sparse + Python Loops   2,123.64
Best Balance      Polars                  0.1580s index, 1,627.94 QPS

References

  1. bm25 part 1
  2. bm25 part 2