# Register every keyword from the keyword file with the keyword processor.
# The loop variables are deliberately not named `str`: binding that name at
# module scope would shadow the builtin for the rest of the module.
for keyword in get_keyword(filename):
    keyword_processor.add_keyword(keyword)

# Segment each comment, then look for registered keywords in the segmented
# text. Swap get_all() for get_from_sql() to run against the real data
# instead of the built-in samples.
for sentence in get_all():
    seg_list = jieba.cut(sentence, use_paddle=True)
    # Tokens are joined with '/' before extraction, presumably so that a
    # keyword only matches along the segmenter's token boundaries.
    result = keyword_processor.extract_keywords('/'.join(seg_list))
    # Only report comments in which at least one keyword was found.
    if result:
        print(result)
def get_keyword(filename):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Each line has any leading byte-order-mark characters (U+FEFF) and any
    trailing newline characters removed; blank lines are kept as empty
    strings.

    Background on the BOM handling:
    https://blog.csdn.net/qq_38939991/article/details/116103252
    """
    with open(filename, "r", encoding="utf-8") as handle:
        # Strip the UTF-8 BOM on the left and the line terminator on the right.
        return [raw.lstrip("\ufeff").rstrip('\n') for raw in handle]
def get_all():
    """Return a fresh list of hard-coded sample comments for local testing."""
    samples = (
        "這個賣家真的很機車",
        "這個客戶真的很棒",
        "好棒棒",
        "GY 產品難用又垃圾",
        "我剛買回來馬上就壞了 GG",
        "舒服 ~",
        "無聊留言看看",
        "工程師測試",
    )
    # Hand back a new list on every call so callers may mutate it freely.
    return list(samples)
def get_from_sql():
    """Return the ``Comment`` column of matching rows from the ``Board`` table.

    Selects every non-null ``Comment`` that contains at least one of the
    watched substrings listed in the query, ordered by ``Comment``.

    Returns:
        A list with one ``Comment`` value per fetched row.

    The connection is always closed, even when the query or fetch raises.
    """
    # The connection charset depends on the SQL Server collation: use the code
    # page that matches your database's collation. Traditional Chinese is
    # cp950. The code page can be looked up with:
    #   SELECT COLLATIONPROPERTY('Chinese_Taiwan_Stroke_CI_AS', 'CodePage')
    #
    # NOTE(review): server address and credentials are hard-coded below;
    # consider loading them from configuration instead of source code.

    # Adjacent literals concatenate to exactly the original query text.
    query = (
        " select Comment from Board where Comment is not null and ("
        " Comment like '%機車%' or Comment like '%wtf%'"
        " or Comment like '%GY%' or Comment like '%GG%'"
        " or Comment like '%FQ%' or Comment like '%好棒%'"
        " or Comment like '%好棒棒%' or Comment like '%很棒%'"
        " ) order by Comment "
    )

    conn = pymssql.connect(server='123.456.78', user='1234', password='12345678',
                           database='GGDB', charset='cp950')
    try:
        # as_dict=True makes each row addressable by column name.
        cursor = conn.cursor(as_dict=True)
        # The query is sent as bytes encoded with the same code page as the
        # connection charset.
        cursor.execute(query.encode('cp950'))
        return [row['Comment'] for row in cursor.fetchall()]
    finally:
        # Release the connection on both the success and the error path.
        conn.close()