In [1]:
import pandas as pd
# Read the raw file (run this cell once per source; the second read would overwrite the first)
# df = pd.read_csv("baishatun_text.csv")  # Threads posts
df = pd.read_csv("FB_baishatun.csv")  # FB posts
# Simple lists of Chinese sentiment keywords
positive_keywords = ["喜歡", "開心", "感謝", "愛", "棒", "熱情", "推薦", "讚", "歡迎", "期待", "幸福", "平安", "媽祖保佑", "感動"]
negative_keywords = ["失望", "生氣", "糟糕", "討厭", "爛", "壞", "崩潰", "無言", "難過", "抱怨", "痛苦", "憤怒"]
# Neutral terms are not labelled explicitly; they only serve as a reference
# Rule-based sentiment labelling
def simple_sentiment(text):
    pos = any(word in str(text) for word in positive_keywords)
    neg = any(word in str(text) for word in negative_keywords)
    if pos and not neg:
        return "正向"
    elif neg and not pos:
        return "負向"
    else:
        # both kinds of keywords, or none at all -> neutral
        return "中立"
# Apply the sentiment labelling
df["情感標註"] = df["text"].apply(simple_sentiment)
# Write the labelled data to a new CSV file
df.to_csv("fb_baishatun_text_label_1.csv", encoding='utf-8-sig', index=False)
print("Sentiment labelling done, saved as 'fb_baishatun_text_label_1.csv'")
Sentiment labelling done, saved as 'fb_baishatun_text_label_1.csv'
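A quick sanity check of the keyword rule on a few made-up sentences (a minimal sketch; the example strings are invented for illustration):
# Hypothetical sample sentences, one per expected branch of simple_sentiment
for s in ["媽祖保佑,一路平安", "排隊排到崩潰,很失望", "今天出發"]:
    print(s, "->", simple_sentiment(s))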
In [19]:
import matplotlib.pyplot as plt
def add_labels(x, y):
    # place each count above its bar, at the bar's x position
    for i in range(len(x)):
        plt.text(x[i], y[i], y[i])
# Threads: count posts per sentiment label
df_t = pd.read_csv("baishatun_text_label_1.csv")
dict_t = {}
for i in range(df_t.shape[0]):
    sent = df_t.iat[i, 1]
    if sent in dict_t:
        dict_t[sent] += 1
    else:
        dict_t[sent] = 1
print("threads", dict_t)
sent_t = list(dict_t.keys())          # Chinese sentiment labels (Threads order)
sent_t_count = list(dict_t.values())  # counts in Threads order; re-aligned to the Facebook order below
# Facebook: count posts per sentiment label
dict1 = {}
for i in range(df.shape[0]):
    sent = df.iat[i, 1]
    if sent in dict1:
        dict1[sent] += 1
    else:
        dict1[sent] = 1
print("Facebook", dict1)
sent1 = list(dict1.keys())  # Chinese sentiment labels (Facebook order)
sent2 = ['Neu', 'P', 'N']   # English tick labels, matching the Facebook label order
sent_count = list(dict1.values())
# Re-align the Threads counts to the Facebook label order so both bar groups match the tick labels
sent_t_count = [dict_t.get(s, 0) for s in sent1]
color = ['r', 'b', 'y']  # colour list (defined but not passed to plt.bar below)
bar = 0.3
x1 = [1, 2, 3]
x2 = [x + bar for x in x1]
plt.figure(figsize=(5, 3))
plt.bar(x1, sent_count, tick_label=sent2, width=bar, label="Facebook")  # tick labels, bar width, legend label
plt.bar(x2, sent_t_count, width=bar, label="Threads")
add_labels(x1, sent_count)
add_labels(x2, sent_t_count)
plt.title("Number of posts per sentiment class")
plt.legend()
plt.show()
threads {'正向': 3384, '中立': 4896, '負向': 175}
Facebook {'中立': 167, '正向': 147, '負向': 2}
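An alternative count that avoids the manual dicts and the label-order mismatch, sketched with value_counts (assuming the Threads file's second column is also named 情感標註):
# Sketch: count labels with pandas and align both sources on one fixed label order
order = ["中立", "正向", "負向"]
fb_counts = df["情感標註"].value_counts().reindex(order, fill_value=0)
threads_counts = df_t["情感標註"].value_counts().reindex(order, fill_value=0)  # assumes this column name
print(pd.DataFrame({"Facebook": fb_counts, "Threads": threads_counts}))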
In [22]:
df = pd.read_csv("FB_baishatun.csv") # FB posts
print(df.shape)
(316, 1)
In [23]:
# Calculate term frequency (TF)
# Remove non-Chinese text (keep Chinese characters only)
import pandas as pd
import jieba
import re
from collections import Counter
# Stopword list (can be extended)
stopwords = set([
    "的", "了", "是", "在", "我", "有", "和", "也", "就", "不", "都", "人", "到", "一", "你", "說", "為", "個", "上", "這", "中"])
def clean_text(text):
    return re.sub(r"[^\u4e00-\u9fff]+", "", str(text))
# Custom dictionary of terms specific to the Baishatun pilgrimage context
jieba.load_userdict("userdict.txt")
# Tokenise and remove stopwords
all_tokens = []
for text in df["text"]:
    cleaned = clean_text(text)
    words = jieba.lcut(cleaned)
    filtered_words = [w for w in words if w not in stopwords and len(w.strip()) > 1]
    all_tokens.extend(filtered_words)
# Count term frequencies
word_freq = Counter(all_tokens)
# Turn the counts into a table (columns: term, frequency)
tf_df = pd.DataFrame(word_freq.items(), columns=["詞彙", "詞頻"]).sort_values(by="詞頻", ascending=False)
# Save as CSV
tf_df.to_csv("baishatun_TF_1.csv", index=False, encoding="utf-8-sig")
print("Done: results saved as 'baishatun_TF_1.csv'")
Done: results saved as 'baishatun_TF_1.csv'
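To preview the result without opening the CSV, the top of the sorted table (or the Counter itself) can be printed directly:
# Peek at the most frequent terms
print(tf_df.head(10))
print(word_freq.most_common(10))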
In [24]:
# Calculate TF-IDF and draw a word cloud
import pandas as pd
import jieba
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
def clean_and_tokenize(text):
    text = re.sub(r"[^\u4e00-\u9fff]+", "", str(text))  # keep Chinese characters only
    words = jieba.lcut(text)
    filtered = [w for w in words if w not in stopwords and len(w.strip()) > 1]
    return " ".join(filtered)
df["processed"] = df["text"].apply(clean_and_tokenize)
# ========== 3. Build the TF-IDF matrix ==========
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["processed"])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# ========== 4. Rank posts by the TF-IDF score of a specific term ==========
target_word = "媽祖"
if target_word in tfidf_df.columns:
    df_word_score = tfidf_df[[target_word]]
    ranked = df_word_score.sort_values(by=target_word, ascending=False).reset_index()
    print(f"🔍 Top 10 posts by TF-IDF for the term: {target_word}")
    print(ranked.head(10))
else:
    print(f"❌ The term '{target_word}' does not appear in the corpus")
# ========== 5. Word cloud from the summed TF-IDF weights ==========
# Sum each term's TF-IDF weight over all documents
word_weights = tfidf_df.sum(axis=0).to_dict()
# Generate the word cloud
wc = WordCloud(
    font_path="C:/Windows/Fonts/msjh.ttc",  # Microsoft JhengHei, available on nearly every Windows system
    background_color="white",
    width=800,
    height=600
).generate_from_frequencies(word_weights)
plt.figure(figsize=(10, 7))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("TF-IDF WordCloud")
plt.show()
🔍 Top 10 posts by TF-IDF for the term: 媽祖
   index        媽祖
0    203  0.458540
1    204  0.458540
2    205  0.458540
3    294  0.374554
4    119  0.339020
5    227  0.308786
6    232  0.306935
7    277  0.279344
8    148  0.277896
9     27  0.277896
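The summed TF-IDF weights behind the word cloud can also be listed as a ranked table, which makes the cloud easier to read (a small sketch reusing tfidf_df from above):
# Rank terms by their total TF-IDF weight across all posts
top_terms = tfidf_df.sum(axis=0).sort_values(ascending=False)
print(top_terms.head(20))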