The model has been fine-tuned on the Bulgarian subset.

Import the libraries:
```python
from typing import List, Dict

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
```
First, you need to define the following helper methods, since the model uses a subword tokenizer:
```python
def predict(
    text: str,
    model: torch.nn.Module,
    tokenizer: AutoTokenizer,
    labels_tags={
        0: "O",
        1: "B-PER", 2: "I-PER",
        3: "B-ORG", 4: "I-ORG",
        5: "B-LOC", 6: "I-LOC",
    }
) -> List[Dict[str, str]]:
    # Tokenize the text and reconstruct whole words from the subword tokens.
    tokens_data = tokenizer(text)
    tokens = tokenizer.convert_ids_to_tokens(tokens_data["input_ids"])
    words = subwords_to_words(tokens)

    # Predict a label id for each subword token, then map the ids to tags.
    input_ids = torch.LongTensor(tokens_data["input_ids"]).unsqueeze(0)
    attention_mask = torch.LongTensor(tokens_data["attention_mask"]).unsqueeze(0)
    out = model(input_ids, attention_mask=attention_mask).logits
    out = out.argmax(-1).squeeze(0).tolist()
    prediction = [labels_tags[idx] if idx in labels_tags else idx for idx in out]

    return merge_words_and_predictions(words, prediction)


def subwords_to_words(tokens: List[str]) -> List[str]:
    """Merge SentencePiece-style subword tokens (marked with "▁") into words."""
    out_tokens = []
    curr_token = ""
    for token in tokens:
        if token == "[SEP]":
            # End of the sequence - flush the last word.
            curr_token = curr_token.replace("▁", "")
            out_tokens.append(curr_token)
            out_tokens.append("[SEP]")
            break

        if "▁" in token and curr_token == "":
            # Start of the first word.
            curr_token += token
        elif "▁" in token and curr_token != "":
            # Start of a new word - flush the previous one.
            curr_token = curr_token.replace("▁", "")
            out_tokens.append(curr_token)
            curr_token = token
        elif "▁" not in token:
            # Continuation of the current word.
            curr_token += token

    return out_tokens


def merge_words_and_predictions(words: List[str], entities: List[str]) -> List[Dict[str, str]]:
    """Group consecutive B-/I- tagged words into entities with their entity group."""
    result = []
    curr_word = []
    # Skip the first element of both sequences ([CLS] and its prediction).
    for i, (word, entity) in enumerate(zip(words[1:], entities[1:])):
        if "B-" in entity:
            if curr_word:
                # Flush the previous entity; entities[i] is the tag of the
                # previous word, so e.g. "B-PER"/"I-PER" yields group "PER".
                result.append({
                    "word": " ".join(curr_word),
                    "entity_group": entities[i][2:]
                })
                curr_word = [word]
            else:
                curr_word.append(word)

        if "I-" in entity:
            curr_word.append(word)

        if entity == "O":
            if curr_word:
                result.append({
                    "word": " ".join(curr_word),
                    "entity_group": entities[i][2:]
                })
                curr_word = []

    return result
```
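For illustration, here is how subwords_to_words merges subword tokens back into words. The token sequence below is a hypothetical example; the actual split depends on the tokenizer's vocabulary:

```python
# Hypothetical subword sequence - the real tokenizer may split the text differently.
tokens = ["[CLS]", "▁Бар", "ух", "▁Спин", "оза", "[SEP]"]

print(subwords_to_words(tokens))
# ['[CLS]', 'Барух', 'Спиноза', '[SEP]']
```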
Then, initialize the AutoTokenizer and AutoModelForTokenClassification objects:
```python
MODEL_ID = "auhide/bert-bg-ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
```
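If you want to sanity-check the labels_tags default used by predict(), you can inspect the id-to-label mapping stored in the model configuration (a standard transformers attribute). Note that if the checkpoint's author did not set these names, the config may only contain generic LABEL_N entries, in which case the explicit labels_tags dictionary above is the source of truth:

```python
# id2label is part of every transformers model config.
print(model.config.id2label)
```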
Finally, you can call the predict() method defined above:
```python
text = "Барух Спиноза е роден в Амстердам"
print(f"Input: {text}")
print("NERs:", predict(text, model=model, tokenizer=tokenizer))
```
```
Input: Барух Спиноза е роден в Амстердам
NERs: [{'word': 'Барух Спиноза', 'entity_group': 'PER'}, {'word': 'Амстердам', 'entity_group': 'LOC'}]
```
Note: There are three entity types - PER (person), ORG (organization), and LOC (location).
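As a sketch of an alternative, since this is a standard token-classification checkpoint you could also run inference through the built-in transformers pipeline with entity aggregation instead of the custom helpers above. The subword grouping may differ slightly from merge_words_and_predictions, and aggregation strategies other than "none" require a fast tokenizer:

```python
from transformers import pipeline  # already imported above

# "simple" aggregation merges subword predictions into word-level entity groups.
ner = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
print(ner("Барух Спиноза е роден в Амстердам"))
```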