from datasets import load_dataset
from semhash import SemHash
# Example pipeline: load a text corpus, index it with SemHash, then run three
# independent operations (deduplication, outlier filtering, representative
# selection). Each operation is invoked on the same `semhash` instance rather
# than on the output of the previous step.

# Fetch the "train" split of the "ag_news" dataset and pull out its "text" column.
train_split = load_dataset("ag_news", split="train")
texts = train_split["text"]

# Build the SemHash instance from the raw text records.
semhash = SemHash.from_records(records=texts)

# Self-deduplication: keep the records reported under `.selected`.
deduplication_result = semhash.self_deduplicate()
deduplicated_texts = deduplication_result.selected

# Outlier filtering: keep the records reported under `.selected`.
outlier_result = semhash.self_filter_outliers()
filtered_texts = outlier_result.selected

# Representative selection: keep the records reported under `.selected`.
representative_result = semhash.self_find_representative()
representative_texts = representative_result.selected