BERT Word Embeddings
Libraries
In [2]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import matplotlib.pyplot as plt
%matplotlib inline
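Note: pytorch_pretrained_bert is the legacy name of what is now the Hugging Face transformers library. If only transformers is installed, an equivalent import (a minimal sketch; the rest of this notebook keeps the original package) would be:

# Equivalent imports from the renamed package (assumption: transformers is installed)
from transformers import BertTokenizer, BertModel, BertForMaskedLM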
Load a pre-trained tokenizer model
In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
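The first call to from_pretrained downloads and caches the bert-base-uncased WordPiece vocabulary. A quick sanity check (a minimal sketch; the vocab attribute holds the token-to-ID mapping, roughly 30,000 entries for this model):

# Inspect the size of the loaded WordPiece vocabulary
print(len(tokenizer.vocab))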
Create a sample text
In [10]:
# text = "This is a sample text"
text = "This is the sample sentence for BERT word embeddings"
marked_text = "[CLS] " + text + " [SEP]"
print (marked_text)
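This prints the sentence wrapped in BERT's special tokens: [CLS] This is the sample sentence for BERT word embeddings [SEP]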
Tokenization
In [11]:
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)
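The tokenizer uses WordPiece, so words not in BERT's vocabulary are split into subword pieces prefixed with ##. A quick way to see this on a single word (a minimal sketch; the exact split depends on the bert-base-uncased vocabulary, where "embeddings" typically comes out as 'em', '##bed', '##ding', '##s'):

# WordPiece splits out-of-vocabulary words into ##-prefixed subwords
print(tokenizer.tokenize("embeddings"))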
Convert tokens to IDs
In [12]:
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
for tup in zip(tokenized_text, indexed_tokens):
    print(tup)
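Each ID is simply a row index into BERT's vocabulary, so the mapping can be reversed. A quick round-trip check (a minimal sketch; convert_ids_to_tokens is the inverse of convert_tokens_to_ids on this tokenizer):

# Map the IDs back to their tokens to confirm the round trip
print(tokenizer.convert_ids_to_tokens(indexed_tokens))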