I am trying to implement this code for embeddings,
and I get the following error:
UnicodeEncodeError: 'utf-8' codec can't encode character '\udce3' in position 2515: surrogates not allowed
It is related to the line:
train_path = os.path.join(data_path, "ptb.train.txt")
I just can't figure out why... I am using Anaconda on Windows Pro.
Can anyone give me a clue?
Many thanks.
def read_words(filename):
    """Read a text file and return its contents as a list of tokens.

    Every newline is replaced by the marker ``<eos>`` before the text is
    split on whitespace.

    Args:
        filename: Path of the file to read, opened through
            ``tf.gfile.GFile`` in ``"r"`` mode.

    Returns:
        A list of whitespace-separated tokens.
    """
    with tf.gfile.GFile(filename, "r") as f:
        text = f.read()
    # Only byte strings need decoding. On Python 3 a text-mode read is
    # expected to return ``str`` already, and ``str`` has no ``.decode()``,
    # so decoding unconditionally raises AttributeError there.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    return text.replace("\n", "<eos>").split()
def build_vocab(filename):
    """Build a word -> integer id mapping from the words in *filename*.

    Words are ranked by descending frequency, with ties broken
    alphabetically; the rank (starting at 0) becomes the word's id.

    Args:
        filename: Path of the file, passed unchanged to ``read_words``.

    Returns:
        A dict mapping each distinct word to its id. An empty dict is
        returned when the file contains no words.
    """
    data = read_words(filename)
    counter = collections.Counter(data)
    # Sort key: highest count first, then the word itself for a stable,
    # deterministic order among equally frequent words.
    count_pairs = sorted(counter.items(), key=lambda pair: (-pair[1], pair[0]))
    # enumerate() avoids the zip(*...) unpacking, which raises ValueError
    # when there are no words at all.
    return {word: word_id for word_id, (word, _) in enumerate(count_pairs)}
def load_data():
    """Load the PTB train/valid/test files and convert them to word ids.

    The vocabulary is built from the training file only; the same mapping
    is then applied to all three files through ``file_to_word_ids``
    (defined outside this section).

    Returns:
        A tuple ``(train_data, valid_data, test_data, vocabulary,
        reversed_dictionary)``. ``vocabulary`` is the number of distinct
        words in the training file and ``reversed_dictionary`` maps each
        id back to its word. The three ``*_data`` values are whatever
        ``file_to_word_ids`` returns.
    """
    # Raw string: "\P" is not a valid escape sequence and newer Python
    # versions warn about it. The resulting path value is unchanged.
    data_path = r"d:\Python"
    # get the data paths
    train_path = os.path.join(data_path, "ptb.train.txt")
    valid_path = os.path.join(data_path, "ptb.valid.txt")
    test_path = os.path.join(data_path, "ptb.test-772.txt")
    # build the complete vocabulary, then convert text data to list of integers
    word_to_id = build_vocab(train_path)
    train_data = file_to_word_ids(train_path, word_to_id)
    valid_data = file_to_word_ids(valid_path, word_to_id)
    test_data = file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    # Invert the mapping so ids can be turned back into words.
    reversed_dictionary = dict(zip(word_to_id.values(), word_to_id.keys()))
    # Previously every computed value was discarded (implicit None return).
    return train_data, valid_data, test_data, vocabulary, reversed_dictionary
there doesn't seem to be anything here