Paging Query Results


  • Paging:
    Paging is the process of fetching a subset (a page) of results from a dataset, rather than retrieving the entire result set at once. This allows query results to be processed one page at a time.

  • Default page size:

    • Querying Lucene indexes:
      If the client's query definition does not explicitly specify the page size, the server will default to the C# int.MaxValue value (2,147,483,647). In such a case, all results are returned in a single server call.

    • Querying Corax indexes:
      The default page size is the same as the one used by Lucene.
      Note: when using Corax as the search engine, indexes with more than int.MaxValue entries can be created and used.
      To match this capacity, queries over Corax indexes can skip a number of results that exceeds this max value and take documents from that position.

  • Performance:
    Using paging is beneficial when handling large result datasets, contributing to improved performance.
    See Paging and performance below.

No-paging example

# A simple query without paging:
# ==============================
all_results = list(
    session.query_index_type(Products_ByUnitsInStock, Products_ByUnitsInStock.IndexEntry)
    .where_greater_than("units_in_stock", 10)
    .of_type(Product)
)

# Executing the query on the Northwind sample data
# will result in all 47 Product documents that match the query predicate.
# The index definition:
# =====================
class Products_ByUnitsInStock(AbstractIndexCreationTask):
    def __init__(self):
        super().__init__()
        self.map = "from product in docs.Products select new { units_in_stock = product.UnitsInStock }"

    class IndexEntry:
        def __init__(self, units_in_stock: int = None):
            self.units_in_stock = units_in_stock
// The RQL sent to the server:
from index "Products/ByUnitsInStock"
where UnitsInStock > 10

Paging examples

Retrieve a specific page:

# Retrieve only the 3rd page - when the page size is 10:
# ===================================================
def __stats_callback(statistics: QueryStatistics):
    total_results = statistics.total_results
    # While the query below returns only 10 results,
    # 'total_results' will hold the total number of matching documents (47).

third_page_results = list(
    session.query_index_type(Products_ByUnitsInStock, Products_ByUnitsInStock.IndexEntry)
    # Get the query stats if you wish to know the TOTAL number of results
    .statistics(__stats_callback)
    # Apply some filtering condition as needed
    .where_greater_than("units_in_stock", 10).of_type(Product)
    # Call 'skip', pass the number of items to skip from the beginning of the result set
    # Skip the first 20 resulting documents
    .skip(20)
    # Call 'take' to define the number of documents to return
    # Take up to 10 products => so 10 is the "Page Size"
    .take(10)
)

# When executing this query on the Northwind sample data,
# results will include only 10 Product documents ("products/45-A" to "products/54-A")

# The index definition:
# =====================
class Products_ByUnitsInStock(AbstractIndexCreationTask):
    def __init__(self):
        super().__init__()
        self.map = "from product in docs.Products select new { units_in_stock = product.UnitsInStock }"

    class IndexEntry:
        def __init__(self, units_in_stock: int = None):
            self.units_in_stock = units_in_stock
// The RQL sent to the server:
from index "Products/ByUnitsInStock"
where UnitsInStock > 10
limit 20, 10 // skip 20, take 10

Page through all results:

# Query for all results - page by page:
# =====================================
paged_results: List[Product] = []
page_number = 0
page_size = 10

while True:
    paged_results = list(
        session.query_index_type(Products_ByUnitsInStock, Products_ByUnitsInStock.IndexEntry)
        # Apply some filtering condition as needed
        .where_greater_than("units_in_stock", 10).of_type(Product)
        # Skip the number of results that were already fetched
        .skip(page_number * page_size)
        # Request to get 'page_size' results
        .take(page_size)
    )
    page_number += 1

    if len(paged_results) == 0:
        break

    # Make any processing needed with the current paged results here
    # ...
# The index definition:
# =====================
class Products_ByUnitsInStock(AbstractIndexCreationTask):
    def __init__(self):
        super().__init__()
        self.map = "from product in docs.Products select new { units_in_stock = product.UnitsInStock }"

    class IndexEntry:
        def __init__(self, units_in_stock: int = None):
            self.units_in_stock = units_in_stock
// The RQL sent to the server:
from index "Products/ByUnitsInStock"
where UnitsInStock > 10
limit 0, 10 // First loop will skip 0, take 10

// The next loops in the code will each generate the above RQL with an increased 'skip' value:
// limit 10, 10
// limit 20, 10
// limit 30, 10
// ...

Paging and performance

Better performance:

It is recommended to explicitly set a page size when making a query that is expected to generate a significant number of results (a short sketch follows the list below). This practice has several benefits:

  • Optimizes bandwidth usage by reducing data transfer between the server and client.
  • Prevents delays in response times caused by sending too much data over the network.
  • Avoids high memory consumption when dealing with numerous documents.
  • Ensures a more manageable user experience by not overwhelming users with massive datasets at once.
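
For example, here is a minimal sketch (reusing the index, entity, and field names from the examples above) that explicitly caps the page size instead of relying on the server default:

# Explicitly set a page size for a query that may yield many results:
# ====================================================================
page_size = 25

products = list(
    session.query_index_type(Products_ByUnitsInStock, Products_ByUnitsInStock.IndexEntry)
    .where_greater_than("units_in_stock", 10)
    .of_type(Product)
    # Return at most 'page_size' documents in this server call
    .take(page_size)
)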

Performance hints:

  • By default, if the number of returned results exceeds 2048, the server will issue a "Page size too big" notification (visible in the Studio) with information about the query.

  • This threshold can be customized by modifying the value of the PerformanceHints.MaxNumberOfResults configuration key (see the configuration sketch after this list).

  • As suggested by this performance hint, you may consider using Streaming query results instead of paging.

    Figure 1. Performance Hint
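
For example, here is a sketch of raising the threshold, assuming the standard RavenDB configuration mechanisms (settings.json or a RAVEN_-prefixed environment variable); the value 5000 is illustrative only:

// In the server's settings.json:
{
    "PerformanceHints.MaxNumberOfResults": 5000
}

// Or via an environment variable:
// RAVEN_PerformanceHints_MaxNumberOfResults=5000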

Paging through tampered results

  • The QueryStatistics object contains the total_results property,
    which represents the total number of matching documents found in the query results.

  • The QueryStatistics object also contains the skipped_results property.
    Whenever this property is greater than 0, that implies the server has skipped that number of results from the index.

  • The server will skip duplicate results internally in the following two scenarios:

    1. When making a Projection query with Distinct.

    2. When querying a Fanout index.

  • In these cases:

    • The skipped_results property from the stats object will hold the count of skipped (duplicate) results.

    • The total_results property will be invalidated;
      it will not deduct the number of skipped results from the total number of results.

  • To do proper paging in these scenarios:
    include the skipped_results value when specifying the number of documents to skip for each page using:
    (current_page * page_size) + skipped_results.
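
For example, here is a minimal sketch of computing the adjusted "start location" (the variable names and numbers are illustrative):

# Calculate the number of documents to skip when the server has skipped results:
# ===============================================================================
page_size = 10
current_page = 2       # zero-based page number
skipped_results = 3    # accumulated from QueryStatistics.skipped_results

start_location = (current_page * page_size) + skipped_results  # 23
# Use it in the query: query.skip(start_location).take(page_size)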

Examples

A projection query with Distinct:

paged_results: List[ProjectedClass] = []

total_results = 0
total_unique_results = 0
skipped_results = 0

page_number = 0
page_size = 10

def __stats_callback(statistics: QueryStatistics):
    nonlocal total_results, skipped_results
    total_results = statistics.total_results
    skipped_results += statistics.skipped_results

while True:
    paged_results = list(
        session.query_index_type(Products_ByUnitsInStock, Products_ByUnitsInStock.IndexEntry)
        .statistics(__stats_callback)
        .where_greater_than("units_in_stock", 10)
        .of_type(Product)
        # Define a projection
        .select_fields(ProjectedClass)
        # Call distinct to remove duplicate projected results
        .distinct()
        # Add the number of skipped results to the "start location"
        .skip((page_number * page_size) + skipped_results)
        .take(page_size)
    )

    total_unique_results += len(paged_results)

    if len(paged_results) == 0:
        break

    page_number += 1

# When executing the query on the Northwind sample data:
# ======================================================

# The total matching results reported in the stats is 47 (total_results),
# but the total number of unique objects returned while paging the results is only 29 (total_unique_results)
# due to the 'Distinct' usage which removes duplicates.

# This is solved by adding the skipped results count to skip().
# The index definition:
# =====================
class Products_ByUnitsInStock(AbstractIndexCreationTask):
    def __init__(self):
        super().__init__()
        self.map = "from product in docs.Products select new { units_in_stock = product.UnitsInStock }"

    class IndexEntry:
        def __init__(self, units_in_stock: int = None):
            self.units_in_stock = units_in_stock
# The class used for the projection:
class ProjectedClass:
    def __init__(self, category: str = None, supplier: str = None):
        self.category = category
        self.supplier = supplier

    # Handle different casing by implementing from_json class method
    @classmethod
    def from_json(cls, json_dict: Dict[str, Any]):
        return cls(json_dict["Category"], json_dict["Supplier"])
// The RQL sent to the server:
from index "Products/ByUnitsInStock"
where UnitsInStock > 10
select distinct Category, Supplier
limit 0, 10  // First loop will skip 0, take 10, etc.

Querying a Fanout index:

paged_results: List[Order] = []

total_results = 0
total_unique_results = 0
skipped_results = 0

page_number = 0
page_size = 50

def __stats_callback(statistics: QueryStatistics):
    nonlocal total_results, skipped_results
    skipped_results += statistics.skipped_results
    total_results = statistics.total_results

while True:
    paged_results = list(
        session.query_index_type(Orders_ByProductName, Orders_ByProductName.IndexEntry)
        .statistics(__stats_callback)
        .of_type(Order)
        # Add the number of skipped results to the "start location"
        .skip((page_number * page_size) + skipped_results)
        .take(page_size)
    )

    total_unique_results += len(paged_results)

    if len(paged_results) == 0:
        break

    page_number += 1

# When executing the query on the Northwind sample data:
# ======================================================

# The total results reported in the stats is 2155 (total_results),
# which represents the multiple index-entries generated by the fanout index.

# By adding the skipped results count to the skip() method,
# we get the correct number of unique results, which is 830 Order documents.
# A fanout index - creating MULTIPLE index-entries per document:
# ==============================================================
class Orders_ByProductName(AbstractIndexCreationTask):
    class IndexEntry:
        def __init__(self, product_name: str = None):
            self.product_name = product_name

    def __init__(self):
        super().__init__()
        self.map = "from order in docs.Orders from line in order.Lines select new { product_name = line.ProductName }"
// The RQL sent to the server:
from index "Orders/ByProductName"
limit 0, 50  // First loop will skip 0, take 50, etc.