python 機器學習 line 幹話辨別

最近搞機器學習, 想說用簡單的 project 來玩看看, 沒想到還滿有趣的 LOL

首先用 python 把對話紀錄轉為 csv, 接著工人智慧開始標記幹話, 正常為 normal 幹話則為 trash

import csv
import re

input_file = "對話紀錄.txt"
output_file = "1對話紀錄.csv"

# Preferred line format: time, sender and message separated by TAB characters,
# e.g. "20:28<TAB>Kay<TAB>小組名單". Splitting on tabs keeps a display name that
# contains spaces (such as "Kay Lin") in one piece.
# NOTE(review): assumes the chat export is tab-delimited — confirm on your file.
tab_pattern = re.compile(r"^(\d{1,2}:\d{2})\t([^\t]+)\t(.*)$")

# Fallback format: time + sender + message separated by any whitespace, e.g.
# 20:28    Kay    小組名單
# Here the sender is cut at its first whitespace character.
pattern = re.compile(r"^(\d{1,2}:\d{2})\s+(.+?)\s+(.*)$")


def parse_chat_lines(lines):
    """Group raw chat-log lines into one record per message.

    Args:
        lines: Iterable of text lines (a trailing newline on each is allowed).

    Returns:
        A list of ``{"name": sender, "msg": text}`` dicts in file order. A line
        that does not start a new message is appended to the previous message
        after a newline; such lines before the first message are ignored.
    """
    rows = []
    current = None

    for raw_line in lines:
        line = raw_line.rstrip("\n")
        match = tab_pattern.match(line) or pattern.match(line)

        if match:
            # A new message starts here: store the one collected so far.
            if current is not None:
                rows.append(current)

            _, name, msg = match.groups()  # the timestamp is not needed
            current = {"name": name, "msg": msg}
        elif current is not None:
            # Continuation of a multi-line message.
            current["msg"] += "\n" + line

    # Store the final message.
    if current is not None:
        rows.append(current)

    return rows


def write_rows(rows, path):
    """Write the parsed messages to ``path`` as CSV with a default label.

    Every row is labelled "normal"; trash talk is re-labelled by hand later.
    """
    with open(path, "w", newline="", encoding="utf-8-sig") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["name", "message", "label"])
        writer.writerows([r["name"], r["msg"], "normal"] for r in rows)


if __name__ == "__main__":
    with open(input_file, "r", encoding="utf-8") as f:
        rows = parse_chat_lines(f)

    write_rows(rows, output_file)
    print("完成！已輸出：", output_file)

接著在 postgresql 建立資料表

-- Dedicated database for the labelled chat messages.
CREATE DATABASE trashtalk;

-- NOTE(review): CREATE TABLE runs in whichever database the session is
-- currently connected to; presumably you connect to trashtalk first — confirm.
-- One row per chat message imported from the CSV file.
CREATE TABLE line_messages (
    id SERIAL PRIMARY KEY,  -- auto-generated row id
    name TEXT,              -- sender name
    message TEXT,           -- message text (may span several lines)
    label TEXT              -- 'normal' or 'trash'
);

用 PSQL Tool 執行命令匯入資料

\copy line_messages(name, message, label) FROM 'C:\\data\\yourmessage.csv' DELIMITER ',' CSV HEADER ENCODING 'UTF8'

也可以用 sql 看看誰是幹話王, 測出來的結果不意外.. 心中認定的幹話王果然是 top1 竟然有 40.654205607476634% 的機率是幹話!
但是只看一個群組或是對話內容不準, 最好多幾個, 不過測起來我自己的幹話竟然也有 25% = =
本來我還以為自己是個不太講幹話的人 LOL

-- Per-sender statistics, highest trash percentage first.
-- Columns: name, total messages, normal count, trash count, trash percentage.
-- FILTER aggregates compute every count in a single pass over the table
-- (no correlated subquery per sender) and also count rows whose name is NULL.
select name,
       count(*),
       count(*) filter (where label = 'normal'),
       count(*) filter (where label = 'trash'),
       (count(*) filter (where label = 'trash')::double precision
        / nullif(count(*)::double precision, 0)) * 100 as trash_percentage
from line_messages
group by name
order by 5 desc

然後用以下程式碼訓練模型

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sqlalchemy import create_engine

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import jieba

# Connect to PostgreSQL.
# NOTE(review): the credentials are hard-coded in the URL; fine for a local
# experiment, but do not reuse this as-is anywhere shared.
engine = create_engine(
    "postgresql+psycopg2://postgres:postgres@localhost:5432/trashtalk"
)

# Load every labelled message.
data = pd.read_sql_query("select * from line_messages", engine)

# A NULL message or label arrives as None/NaN and cannot be tokenized or
# label-encoded, so drop those rows before training.
data = data.dropna(subset=["message", "label"])

# 1. Encode the text labels as integers.
le = LabelEncoder()
data["label"] = le.fit_transform(data["label"])

X = data["message"]
y = data["label"]

# 2. Train/test split, stratified so both sets keep the label ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Chinese word-segmentation helper
def chinese_tokenizer(text):
    """Segment ``text`` with jieba and return the tokens as a list."""
    return jieba.lcut(text)

# 4. TF-IDF features built on the Chinese tokenizer.
# token_pattern=None states explicitly that the default regex is unused once a
# custom tokenizer is supplied, which avoids scikit-learn's UserWarning.
text_feature_extraction = TfidfVectorizer(
    tokenizer=chinese_tokenizer,
    token_pattern=None,
    max_features=5000,
    ngram_range=(1, 2),
)

# 5. Build the pipeline (multi_class is intentionally not passed).
# class_weight="balanced" re-weights the classes by their frequency.
model = LogisticRegression(solver="lbfgs", max_iter=1000, class_weight='balanced')
pipeline = Pipeline([("tfidf", text_feature_extraction), ("model", model)])

# 6. Train the model.
pipeline.fit(X_train, y_train)

# 7. Accuracy on the held-out test set.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# 8. Predict a single Chinese message and return its text label
def trash_talk_prediction(message):
    """Classify ``message`` and return its label as text.

    The pipeline predicts an encoded label for a one-item batch, and the label
    encoder maps that value back to the original string.
    """
    encoded = pipeline.predict([message])
    return le.inverse_transform(encoded)[0]

測試訓練結果看看是否跟自己的 幹話 或是 正常 區分得出來, 果然幹話丟進去都是幹話, 讚!

# Print the predicted label for a few hand-picked messages, in order.
for sample in (
    "建議導入送餐機器人",
    "建議加上直排輪，移動上較為方便",
    "沒這麼胖的機器人🤣🤣🤣",
    "孔鏘老祖嗎？",
    "杯麵來了~~",
):
    print(trash_talk_prediction(sample))

後來想搞個 fastapi 然後又噴奇怪的錯誤了, 發現不能把 chinese_tokenizer 直接定義在 jupyter 裡面再拿去訓練, 要額外抽成一個模組, 不然載入模型時會噴這個錯噴到死, 搞超久 - . -"

AttributeError: Can't get attribute 'chinese_tokenizer' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>

tokenizer_module.py

import jieba
from snownlp import SnowNLP

def chinese_tokenizer(text):
    """Segment ``text`` with jieba and return the tokens as a list.

    Kept in its own importable module so that a pickled pipeline which uses it
    can look it up by module path when loaded from another program.
    """
    return jieba.lcut(text)

# def chinese_tokenizer(text):
#     s = SnowNLP(text)
#     return list(s.words)

train.py


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sqlalchemy import create_engine

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from tokenizer_module import chinese_tokenizer

import joblib
from snownlp import SnowNLP

# Connect to PostgreSQL.
# NOTE(review): the credentials are hard-coded in the URL; fine for a local
# experiment, but do not reuse this as-is anywhere shared.
engine = create_engine(
    "postgresql+psycopg2://postgres:postgres@localhost:5432/trashtalk"
)

# Load every labelled message.
data = pd.read_sql_query("select * from line_messages", engine)

# A NULL message or label arrives as None/NaN and cannot be tokenized or
# label-encoded, so drop those rows before training.
data = data.dropna(subset=["message", "label"])

# 1. Encode the text labels as integers.
le = LabelEncoder()
data["label"] = le.fit_transform(data["label"])

X = data["message"]
y = data["label"]

# 2. Train/test split, stratified so both sets keep the label ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# 4. TF-IDF features built on the Chinese tokenizer imported from
# tokenizer_module. token_pattern=None states explicitly that the default
# regex is unused once a custom tokenizer is supplied, which avoids
# scikit-learn's UserWarning.
text_feature_extraction = TfidfVectorizer(
    tokenizer=chinese_tokenizer,
    token_pattern=None,
    max_features=5000,
    ngram_range=(1, 2),
)

# 5. Build the pipeline (multi_class is intentionally not passed).
# class_weight="balanced" re-weights the classes by their frequency.
model = LogisticRegression(solver="lbfgs", max_iter=1000, class_weight='balanced')
pipeline = Pipeline([("tfidf", text_feature_extraction), ("model", model)])

# 6. Train the model.
pipeline.fit(X_train, y_train)

# 7. Accuracy on the held-out test set.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")



# Persist the fitted pipeline and the label encoder as .pkl files, in that
# order, so another program can load them later.
for artifact, filename in (
    (pipeline, "trash_talk_pipeline.pkl"),
    (le, "label_encoder.pkl"),
):
    joblib.dump(artifact, filename)

trashapi.py

from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import jieba
from snownlp import SnowNLP
from tokenizer_module import chinese_tokenizer  # 🔹 這行非常重要

# -------------------------------
# Load the trained artifacts
# -------------------------------
# Both files are read once at import time from the current working directory.
# NOTE(review): joblib.load unpickles its input; only load files you produced
# yourself.
pipeline = joblib.load("trash_talk_pipeline.pkl")
le = joblib.load("label_encoder.pkl")

# -------------------------------
# FastAPI initialisation
# -------------------------------
app = FastAPI(title="Trash Talk Classifier API")

# -------------------------------
# Request body model
# -------------------------------
# Body accepted by the /predict route: a single text message to classify.
class MessageRequest(BaseModel):
    # The chat message to classify.
    message: str

# -------------------------------
# Prediction route
# -------------------------------
@app.post("/predict")
def predict(request: MessageRequest):
    # Classify one message and respond with the original text, the predicted
    # text label and a per-label probability mapping.
    batch = [request.message]

    # Encoded prediction, mapped back to its text label.
    encoded = pipeline.predict(batch)
    pred_label = le.inverse_transform(encoded)[0]

    # Probabilities for the single input row, keyed by text label and
    # converted to plain floats.
    probabilities = pipeline.predict_proba(batch)[0]
    scores = {}
    for label, value in zip(le.classes_, probabilities):
        scores[label] = float(value)

    return {
        "message": request.message,
        "predicted_label": pred_label,
        "scores": scores
    }

post 後吐出內容

{
  "message": "已經找很多鏟子了，而鏟子超人已經去嗨了！",
  "predicted_label": "trash",
  "scores": {
    "normal": 0.17322229156229862,
    "trash": 0.8267777084377014
  }
}