from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from semhash import SemHash
# Image-capable embedding model ("clip-ViT-B-32") and the train split of the
# CIFAR-10 dataset that will be processed.
model = SentenceTransformer("clip-ViT-B-32")
dataset = load_dataset("uoft-cs/cifar10", split="train")

# Materialize the dataset as a list of records and build the SemHash instance
# over the 'img' column only, embedding with the model loaded above.
semhash = SemHash.from_records(
    list(dataset),
    columns=["img"],
    model=model,
)

# Each self_* call below returns a result object; only its `.selected`
# attribute is kept.

# 1) Deduplication of the images against each other.
deduplication_result = semhash.self_deduplicate()
deduplicated_images = deduplication_result.selected

# 2) Outlier filtering.
outlier_result = semhash.self_filter_outliers()
filtered_images = outlier_result.selected

# 3) Selection of representative images.
representative_result = semhash.self_find_representative()
representative_images = representative_result.selected