In [1]:
import pandas as pd
# Read the raw file (run this cell once per source; the second read would overwrite the first)
# df = pd.read_csv("baishatun_text.csv")  # Threads posts
df = pd.read_csv("FB_baishatun.csv")  # FB posts
# Simple lists of Chinese sentiment keywords
positive_keywords = ["喜歡", "開心", "感謝", "愛", "棒", "熱情", "推薦", "讚", "歡迎", "期待", "幸福", "平安", "媽祖保佑", "感動"]
negative_keywords = ["失望", "生氣", "糟糕", "討厭", "爛", "壞", "崩潰", "無言", "難過", "抱怨", "痛苦", "憤怒"]
# Neutral terms are not labelled explicitly; they only serve as a reference
# Rule-based sentiment labelling
def simple_sentiment(text):
    pos = any(word in str(text) for word in positive_keywords)
    neg = any(word in str(text) for word in negative_keywords)
    if pos and not neg:
        return "正向"
    elif neg and not pos:
        return "負向"
    else:
        # both kinds of keywords, or none at all -> neutral
        return "中立"
# Apply the sentiment labelling
df["情感標註"] = df["text"].apply(simple_sentiment)
# Write the labelled data to a new CSV file
df.to_csv("fb_baishatun_text_label_1.csv", encoding='utf-8-sig', index=False)
print("Sentiment labelling done, saved as 'fb_baishatun_text_label_1.csv'")
Sentiment labelling done, saved as 'fb_baishatun_text_label_1.csv'
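A quick sanity check of the keyword rule on a few made-up sentences (a minimal sketch; the example strings are invented for illustration):
# Hypothetical sample sentences, one per expected branch of simple_sentiment
for s in ["媽祖保佑,一路平安", "排隊排到崩潰,很失望", "今天出發"]:
    print(s, "->", simple_sentiment(s))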
In [19]:
import matplotlib.pyplot as plt
def add_labels(x, y):
    # place each count above its bar, at the bar's x position
    for i in range(len(x)):
        plt.text(x[i], y[i], y[i])
# Threads: count posts per sentiment label
df_t = pd.read_csv("baishatun_text_label_1.csv")
dict_t = {}
for i in range(df_t.shape[0]):
    sent = df_t.iat[i, 1]
    if sent in dict_t:
        dict_t[sent] += 1
    else:
        dict_t[sent] = 1
print("threads", dict_t)
sent_t = list(dict_t.keys())          # Chinese sentiment labels (Threads order)
sent_t_count = list(dict_t.values())  # counts in Threads order; re-aligned to the Facebook order below
# Facebook: count posts per sentiment label
dict1 = {}
for i in range(df.shape[0]):
    sent = df.iat[i, 1]
    if sent in dict1:
        dict1[sent] += 1
    else:
        dict1[sent] = 1
print("Facebook", dict1)
sent1 = list(dict1.keys())  # Chinese sentiment labels (Facebook order)
sent2 = ['Neu', 'P', 'N']   # English tick labels, matching the Facebook label order
sent_count = list(dict1.values())
# Re-align the Threads counts to the Facebook label order so both bar groups match the tick labels
sent_t_count = [dict_t.get(s, 0) for s in sent1]
color = ['r', 'b', 'y']  # colour list (defined but not passed to plt.bar below)
bar = 0.3
x1 = [1, 2, 3]
x2 = [x + bar for x in x1]
plt.figure(figsize=(5, 3))
plt.bar(x1, sent_count, tick_label=sent2, width=bar, label="Facebook")  # tick labels, bar width, legend label
plt.bar(x2, sent_t_count, width=bar, label="Threads")
add_labels(x1, sent_count)
add_labels(x2, sent_t_count)
plt.title("Number of posts per sentiment class")
plt.legend()
plt.show()
threads {'正向': 3384, '中立': 4896, '負向': 175}
Facebook {'中立': 167, '正向': 147, '負向': 2}
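An alternative count that avoids the manual dicts and the label-order mismatch, sketched with value_counts (assuming the Threads file's second column is also named 情感標註):
# Sketch: count labels with pandas and align both sources on one fixed label order
order = ["中立", "正向", "負向"]
fb_counts = df["情感標註"].value_counts().reindex(order, fill_value=0)
threads_counts = df_t["情感標註"].value_counts().reindex(order, fill_value=0)  # assumes this column name
print(pd.DataFrame({"Facebook": fb_counts, "Threads": threads_counts}))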
In [22]:
df = pd.read_csv("FB_baishatun.csv") # FB posts
print(df.shape)
(316, 1)
In [23]:
# Calculate term frequency (TF)
# Remove non-Chinese text (keep Chinese characters only)
import pandas as pd
import jieba
import re
from collections import Counter
# Stopword list (can be extended)
stopwords = set([
    "的", "了", "是", "在", "我", "有", "和", "也", "就", "不", "都", "人", "到", "一", "你", "說", "為", "個", "上", "這", "中"])
def clean_text(text):
    return re.sub(r"[^\u4e00-\u9fff]+", "", str(text))
# Custom dictionary of terms specific to the Baishatun pilgrimage context
jieba.load_userdict("userdict.txt")
# Tokenise and remove stopwords
all_tokens = []
for text in df["text"]:
    cleaned = clean_text(text)
    words = jieba.lcut(cleaned)
    filtered_words = [w for w in words if w not in stopwords and len(w.strip()) > 1]
    all_tokens.extend(filtered_words)
# Count term frequencies
word_freq = Counter(all_tokens)
# Turn the counts into a table (columns: term, frequency)
tf_df = pd.DataFrame(word_freq.items(), columns=["詞彙", "詞頻"]).sort_values(by="詞頻", ascending=False)
# Save as CSV
tf_df.to_csv("baishatun_TF_1.csv", index=False, encoding="utf-8-sig")
print("Done: results saved as 'baishatun_TF_1.csv'")
Done: results saved as 'baishatun_TF_1.csv'
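To preview the result without opening the CSV, the top of the sorted table (or the Counter itself) can be printed directly:
# Peek at the most frequent terms
print(tf_df.head(10))
print(word_freq.most_common(10))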
In [24]:
# Calculate TF-IDF and draw a word cloud
import pandas as pd
import jieba
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
def clean_and_tokenize(text):
    text = re.sub(r"[^\u4e00-\u9fff]+", "", str(text))  # keep Chinese characters only
    words = jieba.lcut(text)
    filtered = [w for w in words if w not in stopwords and len(w.strip()) > 1]
    return " ".join(filtered)
df["processed"] = df["text"].apply(clean_and_tokenize)
# ========== 3. Build the TF-IDF matrix ==========
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["processed"])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# ========== 4. Rank posts by the TF-IDF score of a specific term ==========
target_word = "媽祖"
if target_word in tfidf_df.columns:
    df_word_score = tfidf_df[[target_word]]
    ranked = df_word_score.sort_values(by=target_word, ascending=False).reset_index()
    print(f"🔍 Top 10 posts by TF-IDF for the term: {target_word}")
    print(ranked.head(10))
else:
    print(f"❌ The term '{target_word}' does not appear in the corpus")
# ========== 5. Word cloud from the summed TF-IDF weights ==========
# Sum each term's TF-IDF weight over all documents
word_weights = tfidf_df.sum(axis=0).to_dict()
# Generate the word cloud
wc = WordCloud(
    font_path="C:/Windows/Fonts/msjh.ttc",  # Microsoft JhengHei, available on nearly every Windows system
    background_color="white",
    width=800,
    height=600
).generate_from_frequencies(word_weights)
plt.figure(figsize=(10, 7))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.title("TF-IDF WordCloud")
plt.show()
🔍 Top 10 posts by TF-IDF for the term: 媽祖
   index        媽祖
0    203  0.458540
1    204  0.458540
2    205  0.458540
3    294  0.374554
4    119  0.339020
5    227  0.308786
6    232  0.306935
7    277  0.279344
8    148  0.277896
9     27  0.277896
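The summed TF-IDF weights behind the word cloud can also be listed as a ranked table, which makes the cloud easier to read (a small sketch reusing tfidf_df from above):
# Rank terms by their total TF-IDF weight across all posts
top_terms = tfidf_df.sum(axis=0).sort_values(ascending=False)
print(top_terms.head(20))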