Model:
cardiffnlp/twitter-roberta-base-dec2021
This is a RoBERTa-base model trained on 124M tweets collected up to the end of December 2021. For more details and performance scores, see here.
Below are some usage examples with the standard Transformers interface. For an alternative interface that compares predictions and perplexity scores of models trained over different time periods, check here.
For other models trained on different time periods, check here.
Preprocessing replaces usernames and links with the placeholders "@user" and "http". If you are interested in keeping the verified users that were retained during training, you can keep the users listed here.
def preprocess(text):
    # Replace user mentions with '@user' and URLs with 'http'; other tokens pass through unchanged
    preprocessed_text = []
    for t in text.split():
        if len(t) > 1:
            t = '@user' if t[0] == '@' and t.count('@') == 1 else t
            t = 'http' if t.startswith('http') else t
        preprocessed_text.append(t)
    return ' '.join(preprocessed_text)
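For example, a quick sanity check of the preprocessing (the tweet below is an illustrative example of ours, not taken from the model card):

print(preprocess("@NASA check this out https://example.com #space"))
# -> "@user check this out http #space"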
from transformers import pipeline, AutoTokenizer

MODEL = "cardiffnlp/twitter-roberta-base-dec2021"
fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def pprint(candidates, n):
    for i in range(n):
        token = tokenizer.decode(candidates[i]['token'])
        score = candidates[i]['score']
        print("%d) %.5f %s" % (i+1, score, token))

texts = [
    "So glad I'm <mask> vaccinated.",
    "I keep forgetting to bring a <mask>.",
    "Looking forward to watching <mask> Game tonight!",
]

for text in texts:
    t = preprocess(text)
    print(f"{'-'*30}\n{t}")
    candidates = fill_mask(t)
    pprint(candidates, 5)
Output:
------------------------------
So glad I'm <mask> vaccinated.
1) 0.33211 fully
2) 0.26205 not
3) 0.22305 getting
4) 0.03790 still
5) 0.01817 all
------------------------------
I keep forgetting to bring a <mask>.
1) 0.04808 mask
2) 0.04628 book
3) 0.03597 lighter
4) 0.03391 pen
5) 0.02982 knife
------------------------------
Looking forward to watching <mask> Game tonight!
1) 0.34191 Squid
2) 0.23768 the
3) 0.15699 The
4) 0.02766 End
5) 0.01233 this
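The example above prints the five candidates returned by default; in recent Transformers versions the fill-mask pipeline also accepts a top_k argument if you want more. This snippet is our addition, not part of the original example:

candidates = fill_mask(t, top_k=10)  # top_k raises the number of returned candidates
pprint(candidates, 10)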
from transformers import AutoTokenizer, AutoModel, TFAutoModel
import numpy as np
from scipy.spatial.distance import cosine
from collections import Counter

def get_embedding(text):  # naive approach for demonstration
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    features = model(**encoded_input)
    features = features[0].detach().cpu().numpy()
    return np.mean(features[0], axis=0)

MODEL = "cardiffnlp/twitter-roberta-base-dec2021"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL)

query = "The book was awesome"
tweets = ["I just ordered fried chicken ?",
          "The movie was great",
          "What time is the next game?",
          "Just finished reading 'Embeddings in NLP'"]

sims = Counter()
for tweet in tweets:
    sim = 1 - cosine(get_embedding(query), get_embedding(tweet))
    sims[tweet] = sim

print('Most similar to: ', query)
print(f"{'-'*30}")
for idx, (tweet, sim) in enumerate(sims.most_common()):
    print("%d) %.5f %s" % (idx+1, sim, tweet))
Output:
Most similar to: The book was awesome
------------------------------
1) 0.99004 The movie was great
2) 0.96320 Just finished reading 'Embeddings in NLP'
3) 0.95858 I just ordered fried chicken ?
4) 0.95356 What time is the next game?
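Note that get_embedding above encodes one text at a time and averages over every token position, including special tokens. A common alternative, sketched below for the same tokenizer and model, batches the texts and mean-pools only over non-padding positions; this helper is our own addition and not part of the original card:

import torch

def get_embeddings_batched(texts):
    # Batch-encode with padding, then mean-pool over non-padding positions only
    texts = [preprocess(t) for t in texts]
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        output = model(**encoded)
    mask = encoded['attention_mask'].unsqueeze(-1).float()  # (batch, seq_len, 1)
    summed = (output[0] * mask).sum(dim=1)                   # sum hidden states of real tokens
    return (summed / mask.sum(dim=1)).numpy()                # (batch, hidden_size)

query_emb, *tweet_embs = get_embeddings_batched([query] + tweets)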
from transformers import AutoTokenizer, AutoModel, TFAutoModel
import numpy as np

MODEL = "cardiffnlp/twitter-roberta-base-dec2021"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
text = "Good night ?"
text = preprocess(text)

# Pytorch
model = AutoModel.from_pretrained(MODEL)
encoded_input = tokenizer(text, return_tensors='pt')
features = model(**encoded_input)
features = features[0].detach().cpu().numpy()
features_mean = np.mean(features[0], axis=0)
#features_max = np.max(features[0], axis=0)

# # Tensorflow
# model = TFAutoModel.from_pretrained(MODEL)
# encoded_input = tokenizer(text, return_tensors='tf')
# features = model(encoded_input)
# features = features[0].numpy()
# features_mean = np.mean(features[0], axis=0)
# #features_max = np.max(features[0], axis=0)
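As a quick check (our addition, not from the card): features contains one hidden-state vector per token, and the pooled features_mean vector has the model's hidden size, which is 768 for a RoBERTa-base model.

print(features.shape)       # (1, sequence_length, 768)
print(features_mean.shape)  # (768,)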