By default, SemHash uses potion-base-8M as its embedding model for text. This model is very fast and works well for most use cases, but you can easily swap it out for a different model for text, images, or any other modality. Any model that follows our encoder protocol will work: the protocol requires only an encode(inputs, **kwargs) method that returns a numpy array, which makes it straightforward to integrate any embedding model. For multilingual text datasets, we recommend potion-multilingual-128M.
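
As a minimal sketch of the protocol, the class below shows the required shape. The name MyCustomEncoder and the random embeddings are purely illustrative placeholders, not part of SemHash; any object exposing an encode method like this can be passed as the model:
import numpy as np

class MyCustomEncoder:
    """Illustrative encoder that satisfies the SemHash encoder protocol."""

    def encode(self, inputs, **kwargs) -> np.ndarray:
        # Placeholder: replace this with calls to your own embedding model.
        # The only requirement is returning a 2D numpy array with one row per input.
        return np.random.rand(len(inputs), 256)

# A hypothetical instance can then be passed to SemHash via the model argument:
# semhash = SemHash.from_records(records=records, model=MyCustomEncoder())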

Using a Model2Vec model

The following example shows how to use a Model2Vec model with SemHash:
from datasets import load_dataset
from model2vec import StaticModel
from semhash import SemHash

# Load a dataset to deduplicate
texts = load_dataset("ag_news", split="train")["text"]

# Load an embedding model (in this example, a multilingual model)
model = StaticModel.from_pretrained("minishlab/potion-multilingual-128M")

# Initialize SemHash with the Model2Vec model
semhash = SemHash.from_records(records=texts, model=model)

# Deduplicate the texts
deduplicated_texts = semhash.self_deduplicate().selected

Using a Sentence Transformer

The following example shows how to use a Sentence Transformer with SemHash:
from datasets import load_dataset
from semhash import SemHash
from sentence_transformers import SentenceTransformer

# Load a dataset to deduplicate
texts = load_dataset("ag_news", split="train")["text"]

# Load a sentence-transformers model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Initialize SemHash with the Sentence Transformer model
semhash = SemHash.from_records(records=texts, model=model)

# Deduplicate the texts
deduplicated_texts = semhash.self_deduplicate().selected

Using Vision Models for Images

You can create custom encoders for any modality by implementing the encoder protocol. Here’s an example using a vision model from timm for image deduplication:
from datasets import load_dataset
import numpy as np
import timm
import torch
from semhash import SemHash

# Create a custom image encoder
class VisionEncoder:
    """Custom encoder using timm models. Implements the Encoder protocol."""

    def __init__(self, model_name: str = "mobilenetv3_small_100.lamb_in1k"):
        self.model = timm.create_model(model_name, pretrained=True, num_classes=0).eval()
        data_config = timm.data.resolve_model_data_config(self.model)
        self.transform = timm.data.create_transform(**data_config, is_training=False)

    def encode(self, inputs, batch_size: int = 128):
        """Encode a batch of PIL images into embeddings."""

        # Convert grayscale to RGB if needed
        rgb_inputs = [img.convert("RGB") if img.mode != "RGB" else img for img in inputs]

        # Process in batches to avoid memory issues
        all_embeddings = []
        with torch.no_grad():
            for i in range(0, len(rgb_inputs), batch_size):
                batch_inputs = rgb_inputs[i : i + batch_size]
                batch = torch.stack([self.transform(img) for img in batch_inputs])
                embeddings = self.model(batch).numpy()
                all_embeddings.append(embeddings)

        return np.vstack(all_embeddings)

# Load image dataset
dataset = load_dataset("uoft-cs/cifar10", split="test")
train_data = [{"img": img, "id": i} for i, img in enumerate(dataset["img"][:100])]
test_data = [{"img": img, "id": i} for i, img in enumerate(dataset["img"][100:150])]

# Initialize SemHash with the custom vision encoder
semhash = SemHash.from_records(train_data, columns=["img"], model=VisionEncoder())

# Single-dataset operations
deduplicated = semhash.self_deduplicate().selected
outliers = semhash.self_filter_outliers().selected
representatives = semhash.self_find_representative().selected

# Cross-dataset operations
test_deduplicated = semhash.deduplicate(test_data).selected
test_outliers = semhash.filter_outliers(test_data).selected
test_representatives = semhash.find_representative(test_data, selection_size=10).selected

You can also use CLIP models from sentence-transformers for image deduplication:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from semhash import SemHash

# CLIP models work with images out of the box
model = SentenceTransformer("clip-ViT-B-32")
dataset = load_dataset("uoft-cs/cifar10", split="test")

# Initialize a SemHash instance with the 'img' column
semhash = SemHash.from_records(list(dataset)[:100], columns=["img"], model=model)

# Deduplicate the images
deduplicated_images = semhash.self_deduplicate().selected
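
Because the CLIP model is passed through the same model argument as any other encoder, the single-dataset and cross-dataset operations shown above (self_filter_outliers, self_find_representative, deduplicate, and so on) should work the same way on image records.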