Model:
cardiffnlp/twitter-roberta-large-2022-154m
This is a RoBERTa-large model trained on 154M tweets until the end of December 2022 (original checkpoint, no incremental updates).
The 154M tweets were filtered from 220M tweets obtained through the Twitter Academic API, covering every month from January 2018 to December 2022. Details on the filtering and preprocessing are available here.
Below, we provide some usage examples based on the standard Transformers interface. If you prefer another interface for comparing predictions and perplexity scores across models trained over different time intervals, check here.
Replace usernames and links with the placeholders "@user" and "http". If you are interested in keeping verified users, which were also retained during training, you may keep the users listed here.
```python
def preprocess(text):
    preprocessed_text = []
    for t in text.split():
        if len(t) > 1:
            t = '@user' if t[0] == '@' and t.count('@') == 1 else t
            t = 'http' if t.startswith('http') else t
        preprocessed_text.append(t)
    return ' '.join(preprocessed_text)
```
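To illustrate, a quick check of what `preprocess` produces on an invented tweet (the handle and URL below are purely illustrative):

```python
# Hypothetical input, only to show the placeholder substitution.
print(preprocess("Shoutout to @some_user for the writeup https://example.com"))
# -> "Shoutout to @user for the writeup http"
```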
```python
from transformers import pipeline, AutoTokenizer

MODEL = "cardiffnlp/twitter-roberta-large-2022-154m"
fill_mask = pipeline("fill-mask", model=MODEL, tokenizer=MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def pprint(candidates, n):
    for i in range(n):
        token = tokenizer.decode(candidates[i]['token'])
        score = candidates[i]['score']
        print("%d) %.5f %s" % (i+1, score, token))

texts = [
    "So glad I'm <mask> vaccinated.",
    "I keep forgetting to bring a <mask>.",
    "Looking forward to watching <mask> Game tonight!",
]

for text in texts:
    t = preprocess(text)
    print(f"{'-'*30}\n{t}")
    candidates = fill_mask(t)
    pprint(candidates, 5)
```
Output:
```
------------------------------
So glad I'm <mask> vaccinated.
1) 0.37136 fully
2) 0.20631 a
3) 0.09422 the
4) 0.07649 not
5) 0.04505 already
------------------------------
I keep forgetting to bring a <mask>.
1) 0.10507 mask
2) 0.05810 pen
3) 0.05142 charger
4) 0.04082 tissue
5) 0.03955 lighter
------------------------------
Looking forward to watching <mask> Game tonight!
1) 0.45783 The
2) 0.32842 the
3) 0.02705 Squid
4) 0.01157 Big
5) 0.00538 Match
```
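Related to the perplexity comparison mentioned earlier, below is a minimal sketch of one way to compute a pseudo-perplexity score with the masked-LM head, masking one position at a time. This is only an illustrative approach (reusing the `preprocess` function defined above), not the interface referenced above.

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

MODEL = "cardiffnlp/twitter-roberta-large-2022-154m"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
mlm = AutoModelForMaskedLM.from_pretrained(MODEL)
mlm.eval()

def pseudo_perplexity(text):
    # Mask each position in turn and accumulate the negative log-likelihood
    # of the original token under the masked-LM head.
    input_ids = tokenizer(preprocess(text), return_tensors='pt')['input_ids'][0]
    nll, count = 0.0, 0
    for i in range(1, len(input_ids) - 1):  # skip <s> and </s>
        masked = input_ids.clone()
        masked[i] = tokenizer.mask_token_id
        with torch.no_grad():
            logits = mlm(masked.unsqueeze(0)).logits[0, i]
        nll -= torch.log_softmax(logits, dim=-1)[input_ids[i]].item()
        count += 1
    return float(torch.exp(torch.tensor(nll / count)))

print(pseudo_perplexity("So glad I'm fully vaccinated."))
```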
```python
from transformers import AutoTokenizer, AutoModel, TFAutoModel
import numpy as np
from scipy.spatial.distance import cosine
from collections import Counter

def get_embedding(text):  # naive approach for demonstration
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    features = model(**encoded_input)
    features = features[0].detach().cpu().numpy()
    return np.mean(features[0], axis=0)

MODEL = "cardiffnlp/twitter-roberta-large-2022-154m"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModel.from_pretrained(MODEL)

query = "The book was awesome"
tweets = ["I just ordered fried chicken ?",
          "The movie was great",
          "What time is the next game?",
          "Just finished reading 'Embeddings in NLP'"]

sims = Counter()
for tweet in tweets:
    sim = 1 - cosine(get_embedding(query), get_embedding(tweet))
    sims[tweet] = sim

print('Most similar to: ', query)
print(f"{'-'*30}")
for idx, (tweet, sim) in enumerate(sims.most_common()):
    print("%d) %.5f %s" % (idx+1, sim, tweet))
```
Output:
```
Most similar to:  The book was awesome
------------------------------
1) 0.99820 The movie was great
2) 0.99306 Just finished reading 'Embeddings in NLP'
3) 0.99257 What time is the next game?
4) 0.98561 I just ordered fried chicken ?
```
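The `get_embedding` helper above averages all token embeddings one sentence at a time. As a sketch of a common refinement (not part of the original example), the snippet below batches several tweets and mean-pools only over positions marked in the attention mask, reusing the `tokenizer`, `model`, `preprocess`, `query`, and `tweets` objects defined above:

```python
import torch

def get_embeddings_batched(texts):
    # Encode a batch with padding and mean-pool over non-padding positions only.
    texts = [preprocess(t) for t in texts]
    enc = tokenizer(texts, padding=True, return_tensors='pt')
    with torch.no_grad():
        hidden = model(**enc).last_hidden_state          # (batch, seq_len, dim)
    mask = enc['attention_mask'].unsqueeze(-1).float()   # (batch, seq_len, 1)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.numpy()

embeddings = get_embeddings_batched([query] + tweets)    # one row per input text
```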
```python
from transformers import AutoTokenizer, AutoModel, TFAutoModel
import numpy as np

MODEL = "cardiffnlp/twitter-roberta-large-2022-154m"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
text = "Good night ?"
text = preprocess(text)

# Pytorch
model = AutoModel.from_pretrained(MODEL)
encoded_input = tokenizer(text, return_tensors='pt')
features = model(**encoded_input)
features = features[0].detach().cpu().numpy()
features_mean = np.mean(features[0], axis=0)
#features_max = np.max(features[0], axis=0)

# # Tensorflow
# model = TFAutoModel.from_pretrained(MODEL)
# encoded_input = tokenizer(text, return_tensors='tf')
# features = model(encoded_input)
# features = features[0].numpy()
# features_mean = np.mean(features[0], axis=0)
# #features_max = np.max(features[0], axis=0)
```
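For reference, a quick sanity check of the shapes produced by the PyTorch branch above, assuming the usual RoBERTa-large hidden size of 1024:

```python
print(features.shape)       # (1, sequence_length, 1024): batch of one tweet
print(features_mean.shape)  # (1024,): one vector after averaging over tokens
```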