模型:
facebook/contriever
该模型按照论文《Towards Unsupervised Dense Information Retrieval with Contrastive Learning》中描述的方法,以无监督方式训练得到。相关代码见 GitHub 仓库:https://github.com/facebookresearch/contriever 。
若直接在 HuggingFace Transformers 中使用该模型,需要额外添加均值池化(mean pooling)操作才能得到句子嵌入。
import torch
from transformers import AutoTokenizer, AutoModel


def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the positions selected by ``mask``.

    Args:
        token_embeddings: Per-token embeddings; assumed to be shaped
            (batch, seq_len, hidden) -- the code sums over dim 1.
        mask: Attention mask with one entry per token (assumed shape
            (batch, seq_len)); non-zero entries mark tokens to keep.

    Returns:
        One embedding per row: the sum of the unmasked token embeddings
        divided by the number of unmasked tokens in that row.
    """
    # Zero out masked-off (padding) positions so they do not contribute
    # to the sum.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    # Clamp the count to at least 1 so a row with no valid tokens yields a
    # zero vector instead of NaN (0 / 0); other rows are unaffected.
    token_counts = mask.sum(dim=1).clamp(min=1)[..., None]
    sentence_embeddings = token_embeddings.sum(dim=1) / token_counts
    return sentence_embeddings


tokenizer = AutoTokenizer.from_pretrained('facebook/contriever')
model = AutoModel.from_pretrained('facebook/contriever')

sentences = [
    "Where was Marie Curie born?",
    "Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.",
    "Born in Paris on 15 May 1859, Pierre Curie was the son of Eugène Curie, a doctor of French Catholic origin from Alsace."
]

# Apply tokenizer
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
outputs = model(**inputs)

# Mean pooling: outputs[0] is the first model output, used here as the
# per-token embeddings (NOTE(review): expected to be the last hidden state
# for this model class -- confirm against the Transformers docs).
embeddings = mean_pooling(outputs[0], inputs['attention_mask'])