from datasets import load_dataset
from model2vec import StaticModel
from semhash import SemHash
# Fetch the "train" split of the "ag_news" dataset and keep only its "text"
# column; these are the records that will be deduplicated.
ag_news_train = load_dataset("ag_news", split="train")
texts = ag_news_train["text"]

# Load the pretrained static embedding model (a multilingual checkpoint in
# this example) that SemHash will use instead of its default.
model = StaticModel.from_pretrained("minishlab/potion-multilingual-128M")

# Build a SemHash instance over the texts, supplying the loaded model via
# the `model=` argument.
semhash = SemHash.from_records(model=model, records=texts)

# Deduplicate the records against themselves.
# NOTE(review): despite the variable name, SemHash's documentation describes
# `self_deduplicate()` as returning a result object rather than a plain list
# of texts -- confirm against the installed version whether an attribute of
# that result should be read here.
deduplicated_texts = semhash.self_deduplicate()