我使用预训练的 DeBERTa-v3-base 模型,并在 Few-NERD 数据集上对其进行了微调。Few-NERD 是一个命名实体识别(NER)数据集,包含超过 18 万条样本和超过 460 万个词元(token)。
标注的实体类别包括:人名、组织机构、地点、建筑物、事件、产品、艺术和杂项。
# Usage example: run the fine-tuned NER model on a sentence and print the
# recognised entities, one per line, as "<entity text> <entity type>".


def _clean_word(pieces):
    """Join sub-word pieces into readable text.

    Every '▁' marker is replaced by a space. Only a single leading space is
    dropped, so an entity whose first piece carries no marker keeps its
    first character.
    """
    text = ''.join(pieces).replace('▁', ' ')
    return text[1:] if text.startswith(' ') else text


def _group_entities(tokens):
    """Merge adjacent tokens of the same entity type.

    Args:
        tokens: Sequence of dicts with the keys 'entity', 'index' and 'word'
            (the per-token output of the token-classification pipeline).

    Returns:
        A list of ``(text, entity_type)`` tuples in input order. A token
        extends the current group only when its 'entity' equals the group's
        type and its 'index' is exactly one more than the previous token's.
        An empty ``tokens`` sequence yields an empty list.
    """
    groups = []
    pieces = []
    label = None
    prev_index = None
    for token in tokens:
        continues_group = (
            bool(pieces)
            and token['entity'] == label
            and token['index'] == prev_index + 1
        )
        if not continues_group:
            # Close the group collected so far before starting a new one.
            if pieces:
                groups.append((_clean_word(pieces), label))
            pieces = []
            label = token['entity']
        pieces.append(token['word'])
        prev_index = token['index']
    if pieces:
        groups.append((_clean_word(pieces), label))
    return groups


def print_ner(sentences):
    """Clean and print NER results.

    Args:
        sentences: Iterable of per-sentence token lists. Each token is a dict
            with the keys 'entity', 'index' and 'word'.

    Prints one line per merged entity in the form ``"<text> <entity type>"``.
    Sentences without any recognised entity (empty lists) print nothing.
    """
    for sentence in sentences:
        for text, label in _group_entities(sentence):
            print(f"{text} {label}")


if __name__ == "__main__":
    # Imported here so that the helpers above can be imported without
    # loading transformers or downloading the model.
    from transformers import pipeline

    pipe = pipeline(model='RashidNLP/NER-Deberta')
    sentence = pipe(["Elon Musk will be at SpaceX's Starbase facility in Boca Chica for the orbital launch of starship next month"])
    print_ner(sentence)