Query by Facets


  • A Faceted Search provides an efficient way to explore and navigate through large datasets or search results.

  • Multiple filters (facets) are applied to narrow down the search results according to different attributes or categories.

Facets


Define an index

  • To make a faceted search, a static-index must be defined for the fields you want to query and apply facets on.

  • The examples in this article will be based on the following Class, Index, and Sample Data:

class Camera:
    """Sample document entity describing a single camera.

    Every constructor argument is optional and defaults to ``None``; each
    one is stored unchanged on the attribute of the same name.
    """

    def __init__(
        self,
        manufacturer: str = None,
        cost: float = None,
        mega_pixels: float = None,
        max_focal_length: int = None,
        units_in_stock: int = None,
    ):
        # Maker and price (indexed as 'brand' and 'price' by Cameras_ByFeatures)
        self.manufacturer, self.cost = manufacturer, cost
        # Technical specification
        self.mega_pixels, self.max_focal_length = mega_pixels, max_focal_length
        # Inventory level
        self.units_in_stock = units_in_stock
class Cameras_ByFeatures(AbstractIndexCreationTask):
    """Static index over the Cameras collection, used by the faceted queries.

    The map exposes ``manufacturer`` as ``brand`` and ``cost`` as ``price``;
    the remaining fields keep their document names.
    """

    class IndexEntry:
        """Shape of one index entry; every field defaults to ``None``."""

        def __init__(
            self,
            brand: str = None,
            price: float = None,
            mega_pixels: float = None,
            max_focal_length: int = None,
            units_in_stock: int = None,
        ):
            self.brand, self.price = brand, price
            self.mega_pixels, self.max_focal_length = mega_pixels, max_focal_length
            self.units_in_stock = units_in_stock

    def __init__(self):
        super().__init__()
        # The map is assembled from fragments so each projected field sits on its own line
        map_fragments = (
            "from camera in docs.Cameras ",
            "select new ",
            "{ ",
            " brand = camera.manufacturer,",
            " price = camera.cost,",
            " mega_pixels = camera.mega_pixels,",
            " max_focal_length = camera.max_focal_length,",
            " units_in_stock = camera.units_in_stock",
            "}",
        )
        self.map = "".join(map_fragments)
# Creating sample data for the examples in this article:
# ======================================================

cameras = [
    Camera(manufacturer="Sony", cost=100, mega_pixels=20.1, max_focal_length=200, units_in_stock=10),
    Camera(manufacturer="Sony", cost=200, mega_pixels=29, max_focal_length=250, units_in_stock=15),
    Camera(manufacturer="Nikon", cost=120, mega_pixels=22.3, max_focal_length=300, units_in_stock=2),
    Camera(manufacturer="Nikon", cost=180, mega_pixels=32, max_focal_length=300, units_in_stock=5),
    Camera(manufacturer="Nikon", cost=220, mega_pixels=40, max_focal_length=300, units_in_stock=20),
    Camera(manufacturer="Canon", cost=200, mega_pixels=30.4, max_focal_length=400, units_in_stock=30),
    Camera(manufacturer="Olympus", cost=250, mega_pixels=32.5, max_focal_length=600, units_in_stock=4),
    Camera(manufacturer="Olympus", cost=390, mega_pixels=40, max_focal_length=600, units_in_stock=6),
    Camera(manufacturer="Fuji", cost=410, mega_pixels=45, max_focal_length=700, units_in_stock=1),
    Camera(manufacturer="Fuji", cost=590, mega_pixels=45, max_focal_length=700, units_in_stock=5),
    Camera(manufacturer="Fuji", cost=650, mega_pixels=61, max_focal_length=800, units_in_stock=17),
    Camera(manufacturer="Fuji", cost=850, mega_pixels=102, max_focal_length=800, units_in_stock=19),
]

with store.open_session() as session:
    for camera in cameras:
        session.store(camera)
    session.save_changes()

Facets - Basics

Facets definition:

  • Define a list of facets by which to aggregate the data.

  • There are two Facet types:

    • Facet - returns a count for each unique term found in the specified index-field.
    • RangeFacet - returns a count per range within the specified index-field.

# Define a Facet:
# ===============
facet = Facet(
    # Specify the index-field for which to get count of documents per unique ITEM
    # e.g. get the number of Camera documents for each unique brand
    field_name="brand",
)

# Set a display name for this field in the results (optional)
facet.display_field_name = "Camera Brand"

# Define a RangeFacet:
# ====================
range_facet = RangeFacet()
# Specify ranges within an index-field in order to get count per RANGE
# e.g. get the number of Camera documents that cost below 200, between 200 & 400, etc...
# Each range is half-open (lower bound included, upper bound excluded), matching the
# Fluent API and RQL variants of this query, so that a boundary price such as 400
# is counted in exactly one range. ('between' includes both bounds.)
range_facet.ranges = [
    "price < 200",
    "price >= 200 and price < 400",
    "price >= 400 and price < 600",
    "price >= 600 and price < 800",
    "price >= 800",
]

# Set a display name for this field in the results (optional)
range_facet.display_field_name = "Camera Price"

# Define a list of facets to query by:
# ====================================
facets = [facet, range_facet]

Query the index for facets results:

  • Query the index to get the aggregated facets information.

  • Either:

    • Pass the facets definition from above directly to the query

    • Or - construct a facet using a builder with the Fluent API option, as shown below.

results = (
    session
    # Query the index
    .query_index_type(Cameras_ByFeatures, Cameras_ByFeatures.IndexEntry)
    # Call 'aggregate_by_facets' to aggregate the data by facets
    # Pass the defined facets from above
    .aggregate_by_facets(facets).execute()
)
# Query the index
results = (
    session.query_index_type(Cameras_ByFeatures, Cameras_ByFeatures.IndexEntry)
    # Call 'aggregate_by' to aggregate the data by facets
    # Use a builder as follows:
    .aggregate_by(
        lambda builder: builder
        # Specify the index-field (e.g. 'brand') for which to get count per unique ITEM
        .by_field("brand")
        # Set a display name for the field in the results (optional)
        .with_display_name("Camera Brand")
    )
    .and_aggregate_by(
        lambda builder: builder
        # Specify ranges within an index field (e.g. 'price') in order to get count per RANGE
        .by_ranges(
            RangeBuilder("price").is_less_than(200),
            RangeBuilder("price").is_greater_than_or_equal_to(200).is_less_than(400),
            RangeBuilder("price").is_greater_than_or_equal_to(400).is_less_than(600),
            RangeBuilder("price").is_greater_than_or_equal_to(600).is_less_than(800),
            RangeBuilder("price").is_greater_than_or_equal_to(800),
        )
        # Set a display name for the field in the results (optional)
        .with_display_name("Camera Price")
    )
    .execute()
)
results = (
    session.advanced
    # Query the index
    # Provide the RQL string to the raw_query method
    .raw_query(
        """from index 'Cameras/ByFeatures'
       select
           facet(brand) as 'Camera Brand',
           facet(price < 200.0,
                 price >= 200.0 and price < 400.0,
                 price >= 400.0 and price < 600.0,
                 price >= 600.0 and price < 800.0,
                 price >= 800.0) as 'Camera Price'""",
        object_type=Camera,
    )
    # Execute the query
    .execute_aggregation()
)
// Field names must match the index definition, which uses lower-case 'brand' and 'price'
from index "Cameras/ByFeatures"
select
    facet(brand) as "Camera Brand",
    facet(price < 200.0,
          price >= 200.0 and price < 400.0,
          price >= 400.0 and price < 600.0,
          price >= 600.0 and price < 800.0,
          price >= 800.0) as "Camera Price"

Query results:

  • Query results are not the collection documents; they are of type:
    Dict[str, FacetResult] which is the facets results per index-field specified.

  • Using the sample data from this article, the resulting aggregations will be:

# The resulting aggregations per display name will contain:
# =========================================================

# For the "Camera Brand" Facet:
#     "canon"   - Count: 1
#     "fuji"    - Count: 4
#     "nikon"   - Count: 3
#     "olympus" - Count: 2
#     "sony"    - Count: 2

# For the "Camera Price" Ranges:
#     "price < 200"                      - Count: 3
#     "200 <= price < 400" - Count: 5
#     "400 <= price < 600" - Count: 2
#     "600 <= price < 800" - Count: 1
#     "price >= 800"                   - Count: 1
# Get facets results for index-field 'brand' using the display name specified:
# ============================================================================
brand_facets = results["Camera Brand"]
number_of_brands = len(brand_facets.values)  # 5 unique brands

# Get the aggregated facet value for a specific brand:
facet_value = brand_facets.values[0]
# The brand name is available in the 'range_' attribute
# Note: value is lower-case since the default RavenDB analyzer was used by the index
self.assertEqual("canon", facet_value.range_)
# Number of documents for 'Canon' is available in the 'count_' attribute
self.assertEqual(1, facet_value.count_)

# Get facets results for index-field 'price' using the display name specified:
# ============================================================================
price_facets = results["Camera Price"]
number_of_ranges = len(price_facets.values)  # 5 different ranges

# Get the aggregated facet value for a specific Range:
facet_value = price_facets.values[0]
self.assertEqual("price < 200", facet_value.range_)  # The range string
self.assertEqual(3, facet_value.count_)

Query further:

  • Typically, after presenting users with the initial facets results which show the available options,
    users can select specific categories to explore further.

  • For example, if the user selects Fuji and Nikon,
    then your next query can include a filter to focus only on those selected brands.

# Narrow the faceted query to the brands the user selected.
# 'execute()' returns the facets results keyed by facet name (Dict[str, FacetResult]),
# so the dict is kept as-is; wrapping it in list() would leave only the key names.
filtered_results = (
    session.query_index_type(Cameras_ByFeatures, Cameras_ByFeatures.IndexEntry)
    .where_in("brand", ["Fuji", "Nikon"])
    .aggregate_by_facets(facets)
    .execute()
)

Facets - Options

Facets definition:

  • Options are available only for the Facet type.

  • Available options:

    • start - The position from which to send items (how many to skip).
    • page_size - Number of items to return.
    • include_remaining_terms - Show summary of items that didn't make it into the requested page_size.
    • term_sort_mode - Set the sort order on the resulting items.

# Define the list of facets to query by:
# ======================================
facets_with_options = [
    # Define a Facet:
    Facet(
        # Specify the index-field for which to get count of documents per unique ITEM
        field_name="brand",
    )
]
# Set some facets options
# Assign facet options after creating the object
facets_with_options[0].options = FacetOptions()
# Return the top 3 brands with most items count:
facets_with_options[0].options.page_size = 3
facets_with_options[0].options.term_sort_mode = FacetTermSortMode.COUNT_DESC
facets_with_options[0].options.start = 0

Query the index for facets results:

results = (
    session
    # Query the index
    .query_index_type(Cameras_ByFeatures, Cameras_ByFeatures.IndexEntry)
    # Call 'aggregate_by_facets' to aggregate the data by facets
    # Pass the defined facets from above
    .aggregate_by_facets(facets_with_options).execute()
)
# Return the top 3 brands with most items count:
facet_options = FacetOptions()
facet_options.start = 0
facet_options.page_size = 3
facet_options.term_sort_mode = FacetTermSortMode.COUNT_DESC

results = (
    session
    # Query the index
    .query_index_type(Cameras_ByFeatures, Cameras_ByFeatures.IndexEntry)
    # Call 'aggregate_by' to aggregate the data by facets
    # Use a builder as follows:
    .aggregate_by(
        lambda builder: builder
        # Specify an index-field (e.g. 'brand') for which to get count per unique ITEM
        .by_field("brand")
        # Specify the facets options
        .with_options(facet_options)
    ).execute()
)
results = (
    session.advanced
    # Query the index
    # Provide the RQL string to the raw_query method
    .raw_query(
        """from index 'Cameras/ByFeatures'
       select facet(brand, $p0)""",
        object_type=Camera,
    )
    # Add the facet options to the "p0" parameter
    .add_parameter("p0", {"PageSize": 3, "TermSortMode": FacetTermSortMode.COUNT_DESC})
    # Execute the query
    .execute_aggregation()
)
// The facet field must match the index definition, which uses lower-case 'brand'
from index "Cameras/ByFeatures"
select facet(brand, $p0)
{"p0": { "TermSortMode": "CountDesc", "PageSize": 3 }}

Query results:

# The resulting items will contain:
# =================================
# For the "brand" Facet:
#     "fuji"    - Count: 4
#     "nikon"   - Count: 3
#     "olympus" - Count: 2
# As requested, only 3 unique items are returned, ordered by documents count descending:
# Get facets results for index-field 'brand':
# ===========================================
brand_facets = results["brand"]
number_of_brands = len(brand_facets.values)  # 3 brands

# Get the aggregated facet value for a specific brand:
facet_value = brand_facets.values[0]
# The brand name is available in the 'range_' attribute
# Note: value is lower-case since the default RavenDB analyzer was used by the index
self.assertEqual("fuji", facet_value.range_)
# Number of documents for 'Fuji' is available in the 'count_' attribute
self.assertEqual(4, facet_value.count_)

Facets - Aggregations

Facets definition:

  • Aggregation of data is available for an index-field per unique Facet or Range item.
    For example:

    • Get the total number of units_in_stock per brand
    • Get the highest mega_pixels value for documents that cost between 200 & 400
  • The following aggregation operations are available:

    • Sum
    • Average
    • Min
    • Max
  • Multiple operations can be added on each facet, for multiple fields.

# Define the list of facets to query by:
# =====================================

# Define a facet:
# ===============
facet_with_aggregations = Facet()
facet_with_aggregations.field_name = "brand"
facet_with_aggregations.aggregations = {
    # Set the aggregation operation:
    FacetAggregation.SUM:
    # Create a set specifying the index-fields for which to perform the aggregation
    {
        # Get total number of units_in_stock per brand
        FacetAggregationField("units_in_stock")
    },
    FacetAggregation.AVERAGE: {
        # Get average price per brand
        FacetAggregationField("price")
    },
    FacetAggregation.MIN: {
        # Get min price per brand
        FacetAggregationField("price")
    },
    FacetAggregation.MAX: {
        # Get max mega_pixels per brand
        FacetAggregationField("mega_pixels"),
        # Get max max_focal_length per brand
        FacetAggregationField("max_focal_length"),
    },
}

# Define a RangeFacet:
# ===================
range_facet_with_aggregations = RangeFacet()
# Each range is half-open (lower bound included, upper bound excluded), matching the
# Fluent API and RQL variants of this query, so that a boundary price such as 400
# is aggregated into exactly one range. ('between' includes both bounds.)
range_facet_with_aggregations.ranges = [
    "price < 200",
    "price >= 200 and price < 400",
    "price >= 400 and price < 600",
    "price >= 600 and price < 800",
    "price >= 800",
]
# Map each aggregation operation to the set of index-fields it is computed on
range_facet_with_aggregations.aggregations = {
    FacetAggregation.SUM: {
        # Get total number of units_in_stock for each group of documents per range specified
        FacetAggregationField("units_in_stock")
    },
    FacetAggregation.AVERAGE: {
        # Get average price of each group of documents per range specified
        FacetAggregationField("price")
    },
    FacetAggregation.MIN: {
        # Get min price of each group of documents per range specified
        FacetAggregationField("price")
    },
    FacetAggregation.MAX: {
        # Get max mega_pixels for each group of documents per range specified
        FacetAggregationField("mega_pixels"),
        # Get max max_focal_length for each group of documents per range specified
        FacetAggregationField("max_focal_length"),
    },
}

facets_with_aggregations = [facet_with_aggregations, range_facet_with_aggregations]

Query the index for facets results:

results = (
    session
    # Query the index
    .query_index_type(Cameras_ByFeatures, Cameras_ByFeatures.IndexEntry)
    # Call 'aggregate_by_facets' to aggregate the data by facets
    # Pass the defined facets from above
    .aggregate_by_facets(facets_with_aggregations).execute()
)
results = (
    session
    # Query the index
    .query_index_type(Cameras_ByFeatures, Cameras_ByFeatures.IndexEntry)
    # Call 'aggregate_by' to aggregate the data by facets
    # Use a builder as follows:
    .aggregate_by(
        lambda builder: builder
        # Specify an index-field (e.g. 'brand') for which to get count per unique ITEM
        .by_field("brand")
        # Specify the aggregations per the brand facet:
        .sum_on("units_in_stock")
        .average_on("price")
        .min_on("price")
        .max_on("mega_pixels")
        .max_on("max_focal_length")
    )
    .and_aggregate_by(
        lambda builder: builder
        # Specify ranges within an index field (e.g. 'price') in order to get count per RANGE
        .by_ranges(
            RangeBuilder("price").is_less_than(200),
            RangeBuilder("price").is_greater_than_or_equal_to(200).is_less_than(400),
            RangeBuilder("price").is_greater_than_or_equal_to(400).is_less_than(600),
            RangeBuilder("price").is_greater_than_or_equal_to(600).is_less_than(800),
            RangeBuilder("price").is_greater_than_or_equal_to(800),
        )
        # Specify the aggregations per the price range:
        .sum_on("units_in_stock")
        .average_on("price")
        .min_on("price")
        .max_on("mega_pixels")
        .max_on("max_focal_length")
    )
    .execute()
)
results = (
    session.advanced
    # Query the index
    # Provide the RQL string to the raw_query method
    .raw_query(
        """
           from index 'Cameras/ByFeatures'
                                select
                                    facet(brand,
                                          sum(units_in_stock),
                                          avg(price),
                                          min(price),
                                          max(mega_pixels),
                                          max(max_focal_length)),
                                    facet(price < $p0,
                                          price >= $p1 and price < $p2,
                                          price >= $p3 and price < $p4,
                                          price >= $p5 and price < $p6,
                                          price >= $p7,
                                          sum(units_in_stock),
                                          avg(price),
                                          min(price),
                                          max(mega_pixels),
                                          max(max_focal_length))
           """
    )
    .add_parameter("p0", 200.0)
    .add_parameter("p1", 200.0)
    .add_parameter("p2", 400.0)
    .add_parameter("p3", 400.0)
    .add_parameter("p4", 600.0)
    .add_parameter("p5", 600.0)
    .add_parameter("p6", 800.0)
    .add_parameter("p7", 800.0)
    # Execute the query
    .execute_aggregation()
)
// Field names must match the index definition, which uses snake_case fields
from index "Cameras/ByFeatures"
select
    facet(brand,
          sum(units_in_stock),
          avg(price),
          min(price),
          max(mega_pixels),
          max(max_focal_length)),
    facet(price < $p0,
          price >= $p1 and price < $p2,
          price >= $p3 and price < $p4,
          price >= $p5 and price < $p6,
          price >= $p7,
          sum(units_in_stock),
          avg(price),
          min(price),
          max(mega_pixels),
          max(max_focal_length))
{"p0":200.0,"p1":200.0,"p2":400.0,"p3":400.0,"p4":600.0,"p5":600.0,"p6":800.0,"p7":800.0}

Query results:

# The resulting items will contain (Showing partial results):
# ===========================================================

# For the "brand" Facet:
#     "canon" Count:1, Sum: 30, Name: UnitsInStock
#     "canon" Count:1, Min: 200, Average: 200, Name: Price
#     "canon" Count:1, Max: 30.4, Name: MegaPixels
#     "canon" Count:1, Max: 400, Name: MaxFocalLength

#     "fuji" Count:4, Sum: 42, Name: UnitsInStock
#     "fuji" Count:4, Min: 410, Name: Price
#     "fuji" Count:4, Max: 102, Name: MegaPixels
#     "fuji" Count:4, Max: 800, Name: MaxFocalLength

#     etc.....
#
# For the "Price" Ranges:
#     "Price < 200.0" Count:3, Sum: 17, Name: UnitsInStock
#     "Price < 200.0" Count:3, Min: 100, Average: 133.33, Name: Price
#     "Price < 200.0" Count:3, Max: 32, Name: MegaPixels
#     "Price < 200.0" Count:3, Max: 300, Name: MaxFocalLength

#     "Price >= 200.0 and Price < 400.0" Count:5, Sum: 75, Name: UnitsInStock
#     "Price >= 200.0 and Price < 400.0" Count:5, Min: 200, Average: 252, Name: Price
#     "Price >= 200.0 and Price < 400.0" Count:5, Max: 40, Name: MegaPixels
#     "Price >= 200.0 and Price < 400.0" Count:5, Max: 600, Name: MaxFocalLength

#     etc.....
# Get results for the 'brand' facets:
# ========================================
brand_facets = results["brand"]

# Get the aggregated facet value for a specific brand:
facet_value = brand_facets.values[0]
# The brand name is available in the 'range_' attribute:
self.assertEqual("canon", facet_value.range_)
# The index-field on which aggregation was done is in the 'name' property:
self.assertEqual("units_in_stock", facet_value.name)
# The requested aggregation result
self.assertEqual(30, facet_value.sum_)

# Get results for the 'price' RangeFacets:
# ========================================
price_range_facets = results["price"]

# Get the aggregated facet value for a specific range:
facet_value = price_range_facets.values[0]
# The range string is available in the 'range_' attribute:
self.assertEqual("price < 200", facet_value.range_)
# The index-field on which aggregation was done is in the 'name' attribute:
self.assertEqual("units_in_stock", facet_value.name)
# The requested aggregation result:
self.assertEqual(17, facet_value.sum_)

Storing facets definition in a document

Define and store facets in a document:

  • The facets definitions can be stored in a document.

  • That document can then be used by a faceted search query.

facet_setup = FacetSetup()
# Provide the ID of the document in which the facet setup will be stored.
# This is optional -
# if not provided then the session will assign an ID for the stored document.
facet_setup.Id = "customDocumentID"

# Define Facets and RangeFacets to query by:
facet = Facet("brand")
range_facet = RangeFacet()
range_facet.ranges = [
    "mega_pixels < 20",
    "mega_pixels between 20 and 30",
    "mega_pixels between 30 and 50",
    "mega_pixels >= 50",
]

facet_setup.facets = [facet]
facet_setup.range_facets = [range_facet]

# Store the facet setup document and save changes:
# ===============================================
session.store(facet_setup)
session.save_changes()

# The document will be stored under the 'FacetSetups' collection

Query using facets from document:

results = (
    session
    # Query the index
    .query_index_type(Cameras_ByFeatures, Cameras_ByFeatures.IndexEntry)
    # Call 'aggregate_using'
    # Pass the ID of the document that contains your facets setup
    .aggregate_using("customDocumentID").execute()
)
results = (
    session.advanced
    # Query the index
    # Provide the RQL string to the raw_query method
    .raw_query("from index 'Cameras/ByFeatures' select facet(id('customDocumentID'))", Camera)
    # Execute the query
    .execute_aggregation()
)
from index "Cameras/ByFeatures"
select facet(id("customDocumentID"))

Syntax

def aggregate_by(
    self, builder_or_facet: Union[Callable[[FacetBuilder], None], FacetBase]
) -> AggregationDocumentQuery[_T]: ...

def aggregate_by_facets(self, facets: List[FacetBase]) -> AggregationDocumentQuery[_T]: ...

def aggregate_using(self, facet_setup_document_id: str) -> AggregationDocumentQuery[_T]: ...
Parameter Type Description
builder_or_facet (Union) Callable[[FacetBuilder]
or
FacetBase
Builder with a fluent API that constructs a FacetBase implementation instance
or
FacetBase implementation instance
facets List[FacetBase] A list of FacetBase implementations instances.
facet_setup_document_id str ID of a document containing FacetSetup

class Facet(FacetBase):
    """Facet over a single index-field.

    Per this article, a Facet returns a count for each unique term found
    in the index-field named by ``field_name``.
    """
    def __init__(self, field_name: str = None):
        super().__init__()
        # Name of the index-field to facet on; None until assigned
        self.field_name = field_name
class RangeFacet(FacetBase):
    """Facet that, per this article, returns a count per range within an index-field."""
    def __init__(self, parent: Optional[FacetBase] = None):
        # NOTE(review): 'parent' is accepted but not used anywhere in this excerpt
        super().__init__()
        # Range expressions given as strings (e.g. "price < 200"); empty until assigned
        self.ranges: List[str] = []
class FacetBase(ABC):
    """State shared by the Facet and RangeFacet definitions."""
    def __init__(self):
        # Optional name under which this facet's results are returned; None by default
        self.display_field_name: Union[None, str] = None
        # Optional FacetOptions for this facet; None by default
        self.options: Union[None, FacetOptions] = None
        # Aggregation operation -> set of index-fields to aggregate; empty by default
        self.aggregations: Dict[FacetAggregation, Set[FacetAggregationField]] = {}
class FacetAggregation(enum.Enum):
    """Aggregation operations used as keys of a facet's ``aggregations`` dict."""
    # NOTE(review): how NONE is used is not shown in this article
    NONE = "None"
    MAX = "Max"
    MIN = "Min"
    AVERAGE = "Average"
    SUM = "Sum"

Fluent API builder methods:

def by_ranges(self, range_: RangeBuilder, *ranges: RangeBuilder) -> FacetOperations[_T]: ...

def by_field(self, field_name: str) -> FacetOperations[_T]: ...

def with_display_name(self, display_name: str) -> FacetOperations[_T]: ...

def with_options(self, options: FacetOptions) -> FacetOperations[_T]: ...

def sum_on(self, path: str, display_name: Optional[str] = None) -> FacetOperations[_T]: ...

def min_on(self, path: str, display_name: Optional[str] = None) -> FacetOperations[_T]: ...

def max_on(self, path: str, display_name: Optional[str] = None) -> FacetOperations[_T]: ...

def average_on(self, path: str, display_name: Optional[str] = None) -> FacetOperations[_T]: ...
Parameter Type Description
range_ RangeBuilder The first range of values within the index-field (required)
*ranges RangeBuilder Any additional ranges (zero or more), separated by ,
field_name str The index-field to use for the facet
path str Points to the index-field to use for the facet (by_ranges, by_field) or for the aggregation (sum_on, min_on, max_on, average_on)
display_name str If set, results of a facet will be returned under this name
options FacetOptions Non-default options to use in the facet definition

Options:

class FacetOptions:
    """Paging and sorting options that can be attached to a Facet."""
    def __init__(self):
        # Number of items to return; defaults to constants.int_max
        self.page_size: int = constants.int_max
        # Position from which to send items (how many to skip); None by default
        self.start: Union[None, int] = None
        # Sort order of the resulting items; VALUE_ASC by default
        self.term_sort_mode: FacetTermSortMode = FacetTermSortMode.VALUE_ASC
        # Whether to include the terms that did not fit into page_size; off by default
        self.include_remaining_terms: bool = False
Option Type Description
term_sort_mode FacetTermSortMode Set the sort order on the resulting items
(VALUE_ASC (Default), VALUE_DESC, COUNT_ASC, COUNT_DESC)
start int The position from which to send items (how many to skip)
page_size int Number of items to return
include_remaining_terms bool Indicates if remaining terms that didn't make it into the requested page_size should be included in results
Default value: False