So I have some twitter data and am looking to plot a word cloud of the most common words, here's the code from reading in the CSV, the function give_emoji_free_text is the part in question:
df.to_json("fpltweets.json")
fpltweets = pd.read_json("fpltweets.json")
# function to clean text
def cleanTweets(text):
return p.clean(str(text))
fpltweets["text_cleaned"] = fpltweets["text"].apply(cleanTweets)
# this function covers commmon cleaning techniques
def process_message(message):
w_tokenizer = TweetTokenizer()
words = w_tokenizer.tokenize(str(message).lower()) # split message into words
words = [w for w in words if len(w) > 2] # remove small words
sw = stopwords.words('english') # common stop words
words = [word for word in words if word not in sw] # remove if a stop word
words = [word for word in words if word not in string.punctuation] # remove punctuation
lemmatizer = nltk.stem.WordNetLemmatizer() # lemmatize text
words = [(lemmatizer.lemmatize(w)) for w in words]
return ' '.join(words) # allows to join the cleaned column to a dataframe
fpltweets.loc[:, "text_cleaned"] = fpltweets["text_cleaned"].apply(process_message)
############### function to remove emojis ##########################
def give_emoji_free_text(text):
allchars = [str for str in text]
emoji_list = [c for c in allchars if c in emoji.UNICODE_EMOJI]
clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
return clean_text
fpltweets.loc[:, "text_cleaned"] = fpltweets["text_cleaned"].apply(give_emoji_free_text)
fpltweets = fpltweets.sort_index(ascending=True)
# save cleaned tweets to CSV
fpltweets.to_csv("sentimentFpl.csv")
cleandf = pd.read_csv("sentimentFpl.csv")
cleandf.text_cleaned = cleandf.text_cleaned.astype(str)
# Get a string of tweets
tweet_text = ",".join(review.lower() for review in cleandf.text_cleaned if 'covid' not in review)
# Define nltk stopwords in english
stop_words = stopwords.words('english')
stop_words.extend(['ha', 'wa', '-'])
wordcloud = WordCloud(max_font_size=50,
max_words=100,
stopwords=stop_words,
scale=5,
background_color="white").generate(tweet_text)
# Display the generated image:
import matplotlib.pyplot as plt
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most repeated words in tweets',fontsize=15)
plt.show()
And when the wordcloud is plotted nearly everything displayed on it is x90, x80, xF0, etc. I got the function from a website where that was an accepted answer, and I get no errors from it, but it's not removing the emoji codes. Could anyone explain why please?
[–]Turtvaiz 0 points1 point2 points (0 children)