0%

python 機器學習 line 幹話辨別

 

最近搞機器學習, 想說用簡單的 project 來玩看看, 沒想到還滿有趣的 LOL

首先用 python 把對話紀錄轉為 csv, 接著工人智慧開始標記幹話, 正常為 normal 幹話則為 trash

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import csv
import re

input_file = "對話紀錄.txt"
output_file = "1對話紀錄.csv"

# Expected line format: time + sender name + message, for example:
#   20:28 Kay 小組名單
pattern = re.compile(r"^(\d{1,2}:\d{2})\s+(.+?)\s+(.*)$")


def parse_chat_lines(lines):
    """Group raw chat-export lines into one record per message.

    A line matching ``pattern`` starts a new message; the timestamp is
    discarded. Any other line is treated as a continuation of the previous
    message and appended to it after a newline. Lines that appear before the
    first message are ignored.

    Args:
        lines: iterable of strings (an open text file works); a trailing
            newline on each line is stripped.

    Returns:
        A list of ``{"name": ..., "msg": ...}`` dicts in input order.
    """
    rows = []
    current = None
    for line in lines:
        line = line.rstrip("\n")
        match = pattern.match(line)
        if match:
            _, name, msg = match.groups()  # the timestamp is not needed
            current = {"name": name, "msg": msg}
            rows.append(current)
        elif current is not None:
            # Wrapped line: belongs to the message collected last.
            current["msg"] += "\n" + line
    return rows


def write_labeled_csv(rows, path, default_label="normal"):
    """Write the parsed messages to ``path`` as CSV with a label column.

    Every row gets ``default_label``; the labels are corrected by hand
    afterwards. The file is written as UTF-8 with a BOM (``utf-8-sig``).
    """
    with open(path, "w", newline="", encoding="utf-8-sig") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["name", "message", "label"])
        writer.writerows([r["name"], r["msg"], default_label] for r in rows)


if __name__ == "__main__":
    with open(input_file, "r", encoding="utf-8") as f:
        rows = parse_chat_lines(f)

    write_labeled_csv(rows, output_file)
    print("完成!已輸出:", output_file)

接著在 postgresql 建立資料表

1
2
3
4
5
6
7
8
CREATE DATABASE trashtalk;

-- Switch to the new database before creating the table; otherwise the table
-- ends up in whatever database the session was already connected to.
-- (\connect is a psql meta-command; in other clients reconnect to trashtalk manually.)
\connect trashtalk

-- One row per chat message; label holds 'normal' or 'trash'.
CREATE TABLE line_messages (
    id SERIAL PRIMARY KEY,
    name TEXT,
    message TEXT,
    label TEXT
);

用 PSQL Tool 執行命令匯入資料

1
-- Client-side import of the labeled CSV into line_messages (psql meta-command, must stay on one line).
-- HEADER skips the column-name row; id is not listed, so the SERIAL default fills it in.
-- Replace the path with the location of your own CSV file.
\copy line_messages(name, message, label) FROM 'C:\\data\\yourmessage.csv' DELIMITER ',' CSV HEADER ENCODING 'UTF8'

也可以用 sql 看看誰是幹話王, 測出來的結果不意外.. 心中認定的幹話王果然是 top1 竟然有 40.654205607476634% 的機率是幹話!
但是只看一個群組或是對話內容不準, 最好多幾個, 不過測起來我自己的幹話竟然也有 25% = =
本來我還以為自己是個不太講幹話的人 LOL

1
2
3
4
5
6
7
8
-- Per-sender message counts and the share of messages labeled 'trash',
-- highest share first.
-- All counts come from the grouped rows via FILTER instead of two correlated
-- subqueries per group; this also keeps the counts consistent for rows whose
-- name is NULL (i.name = o.name never matches NULL).
SELECT
    name,
    COUNT(*) AS total_count,
    COUNT(*) FILTER (WHERE label = 'normal') AS normal_count,
    COUNT(*) FILTER (WHERE label = 'trash') AS trash_count,
    -- NULLIF guards the division; it yields NULL instead of an error on a zero count.
    (COUNT(*) FILTER (WHERE label = 'trash')::double precision
        / NULLIF(COUNT(*)::double precision, 0)) * 100 AS trash_percentage
FROM line_messages
GROUP BY name
ORDER BY trash_percentage DESC;

然後用以下程式碼訓練模型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sqlalchemy import create_engine

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import jieba

# Connect to PostgreSQL.
# NOTE(review): the credentials are hard-coded for a local demo database;
# move them to configuration before using this anywhere else.
engine = create_engine(
    "postgresql+psycopg2://postgres:postgres@localhost:5432/trashtalk"
)

# Load the labeled messages. The table allows NULL in message and label
# (e.g. empty CSV fields imported as NULL); such rows would crash the
# tokenizer / label encoder, so they are filtered out here.
data = pd.read_sql_query(
    "select * from line_messages"
    " where message is not null and label is not null",
    engine,
)

# 1. Encode the text labels as integers; `le` is kept to map predictions back.
le = LabelEncoder()
data["label"] = le.fit_transform(data["label"])

X = data["message"]
y = data["label"]

# 2. Train/test split (80/20), stratified so both sets keep the label ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Chinese word-segmentation function
def chinese_tokenizer(text):
    """Split ``text`` into a list of tokens using jieba."""
    return jieba.lcut(text)

# 4. TF-IDF features built on the Chinese tokenizer.
#    token_pattern=None: the default regex is ignored once a custom tokenizer
#    is supplied, and leaving it set makes scikit-learn emit a UserWarning.
text_feature_extraction = TfidfVectorizer(
    tokenizer=chinese_tokenizer,
    token_pattern=None,
    max_features=5000,
    ngram_range=(1, 2),
)

# 5. Build the pipeline (the multi_class argument was dropped).
model = LogisticRegression(solver="lbfgs", max_iter=1000, class_weight='balanced')
pipeline = Pipeline([("tfidf", text_feature_extraction), ("model", model)])

# 6. Train the model.
pipeline.fit(X_train, y_train)

# 7. Accuracy on the held-out test set.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# 8. Prediction helper for a single Chinese message (returns the text label)
def trash_talk_prediction(message):
    """Classify one message and return its label as text.

    The pipeline predicts an encoded label for the single-item batch; the
    label encoder maps it back to the original label string.
    """
    encoded = pipeline.predict([message])
    return le.inverse_transform(encoded)[0]

測試訓練結果看看是否跟自己的 幹話 或是 正常 區分得出來, 果然幹話丟進去都是幹話, 讚!

1
2
3
4
5
# Spot-check the classifier on a few hand-picked chat messages.
sample_messages = (
    "建議導入送餐機器人",
    "建議加上直排輪,移動上較為方便",
    "沒這麼胖的機器人🤣🤣🤣",
    "孔鏘老祖嗎?",
    "杯麵來了~~",
)
for sample in sample_messages:
    print(trash_talk_prediction(sample))

後來想搞個 fastapi 然後又噴奇怪的錯誤了, 發現不能把 chinese_tokenizer 直接定義在 jupyter 裡面訓練 (pickle 只會記住函式所在的模組名稱, 在 jupyter 裡就是 __main__, API 載入模型時就找不到這個函式), 要額外抽成一個模組, 不然會噴這個錯噴到死, 搞超久 - . -"

1
AttributeError: Can't get attribute 'chinese_tokenizer' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>

tokenizer_module.py

1
2
3
4
5
6
7
8
9
import jieba
from snownlp import SnowNLP

def chinese_tokenizer(text):
    """Split ``text`` into a list of tokens using jieba.

    Lives in its own importable module so that a pickled pipeline can
    resolve it by module name when loaded from another process.
    """
    return jieba.lcut(text)

# def chinese_tokenizer(text):
# s = SnowNLP(text)
# return list(s.words)

train.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sqlalchemy import create_engine

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from tokenizer_module import chinese_tokenizer

import joblib
from snownlp import SnowNLP

# Connect to PostgreSQL.
# NOTE(review): the credentials are hard-coded for a local demo database;
# move them to configuration before using this anywhere else.
engine = create_engine(
    "postgresql+psycopg2://postgres:postgres@localhost:5432/trashtalk"
)

# Load the labeled messages. The table allows NULL in message and label
# (e.g. empty CSV fields imported as NULL); such rows would crash the
# tokenizer / label encoder, so they are filtered out here.
data = pd.read_sql_query(
    "select * from line_messages"
    " where message is not null and label is not null",
    engine,
)

# 1. Encode the text labels as integers; `le` is saved later to map predictions back.
le = LabelEncoder()
data["label"] = le.fit_transform(data["label"])

X = data["message"]
y = data["label"]

# 2. Train/test split (80/20), stratified so both sets keep the label ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# 4. TF-IDF features built on the Chinese tokenizer from tokenizer_module.
#    token_pattern=None: the default regex is ignored once a custom tokenizer
#    is supplied, and leaving it set makes scikit-learn emit a UserWarning.
text_feature_extraction = TfidfVectorizer(
    tokenizer=chinese_tokenizer,
    token_pattern=None,
    max_features=5000,
    ngram_range=(1, 2),
)

# 5. Build the pipeline (the multi_class argument was dropped).
model = LogisticRegression(solver="lbfgs", max_iter=1000, class_weight='balanced')
pipeline = Pipeline([("tfidf", text_feature_extraction), ("model", model)])

# 6. Train the model.
pipeline.fit(X_train, y_train)

# 7. Accuracy on the held-out test set.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Persist the whole pipeline and the label encoder as .pkl files.
# The pickle stores the tokenizer by reference (tokenizer_module.chinese_tokenizer),
# so that module must be importable wherever the pipeline is loaded.
joblib.dump(pipeline, "trash_talk_pipeline.pkl")
joblib.dump(le, "label_encoder.pkl")

trashapi.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import jieba
from snownlp import SnowNLP
from tokenizer_module import chinese_tokenizer # 🔹 這行非常重要

# -------------------------------
# Load the trained model artifacts
# -------------------------------
# Both files are pickles: joblib.load can execute arbitrary code while
# unpickling, so only load files produced by your own training run.
# Unpickling the pipeline also needs tokenizer_module to be importable.
pipeline = joblib.load("trash_talk_pipeline.pkl")
le = joblib.load("label_encoder.pkl")

# -------------------------------
# FastAPI initialisation
# -------------------------------
app = FastAPI(title="Trash Talk Classifier API")

# -------------------------------
# Request body schema
# -------------------------------
class MessageRequest(BaseModel):
    # Raw chat message text to classify.
    message: str

# -------------------------------
# Prediction route
# -------------------------------
@app.post("/predict")
def predict(request: MessageRequest):
    """Classify one message and return the label with per-class probabilities.

    Returns a dict with the original ``message``, the ``predicted_label`` as
    text, and ``scores`` mapping every label name to its probability.
    """
    # Run the pipeline once. For this logistic-regression model the predicted
    # class is the one with the highest probability, so a separate predict()
    # call would only repeat the tokenizing and vectorizing work.
    proba = pipeline.predict_proba([request.message])[0]

    # Column j of predict_proba belongs to pipeline.classes_[j] (an encoded
    # label); decode those instead of assuming they line up with le.classes_.
    class_names = le.inverse_transform(pipeline.classes_)

    # Plain str/float values keep the response JSON-serializable.
    scores = {str(cls): float(score) for cls, score in zip(class_names, proba)}
    pred_label = str(class_names[proba.argmax()])

    return {
        "message": request.message,
        "predicted_label": pred_label,
        "scores": scores
    }

post 後吐出內容

1
2
3
4
5
6
7
8
{
"message": "已經找很多鏟子了,而鏟子超人已經去嗨了!",
"predicted_label": "trash",
"scores": {
"normal": 0.17322229156229862,
"trash": 0.8267777084377014
}
}
關閉