python文本處理的方案(結巴分詞并去除符號)
更新時間:2021年05月26日 11:03:58 作者:依我去
這篇文章主要介紹了python文本處理的方案(結巴分詞并去除符號),具有很好的參考價值,希望對大家有所幫助。如有錯誤或未考慮完全的地方,望不吝賜教
看代碼吧~
import re
import jieba.analyse
import codecs
import pandas as pd
# Matches text enclosed between literal <b> and <e> markers (non-greedy).
_FRAGMENT_RE = re.compile(r'(?<=<b>).*?(?=<e>)')

def simplification_text(xianbingshi,
                        output_path=r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write.txt'):
    """Extract every ``<b>...<e>`` delimited fragment from a text file.

    Reads *xianbingshi* line by line (UTF-8), collects all substrings
    enclosed between literal ``<b>`` and ``<e>`` markers, and writes them
    one per line to *output_path* (UTF-8).

    :param xianbingshi: path of the input file to scan.
    :param output_path: where to write the extracted fragments; defaults
        to the original hard-coded location for backward compatibility.
    """
    fragments = []
    with codecs.open(xianbingshi, 'r', 'utf8') as f:
        for line in f:
            # strip() first so trailing newlines never end up inside a fragment
            fragments.extend(_FRAGMENT_RE.findall(line.strip()))
    with codecs.open(output_path, 'w', 'utf8') as out:
        for fragment in fragments:
            out.write(fragment + '\n')
def jieba_text(input_path=r"C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\xianbingshi_write.txt",
               output_path='word.txt'):
    """Segment a text file with jieba and write the unique words to disk.

    Tokenizes *input_path* (UTF-8) with jieba in accurate mode, strips
    whitespace from each token, removes duplicates while keeping the first
    occurrence, and writes one word per line to *output_path* (UTF-8).

    :param input_path: file to tokenize; defaults to the original
        hard-coded location for backward compatibility.
    :param output_path: destination word list, defaults to ``word.txt``.
    """
    # `with` guarantees the handle is closed (the original leaked it).
    with open(input_path, encoding='utf-8') as f:
        data = f.read()
    seg_list = jieba.cut(data, cut_all=False)  # accurate (non-full) mode
    word_list = [word.strip() for word in seg_list]
    # dict.fromkeys de-duplicates while preserving first-seen order —
    # same result as the original pandas drop_duplicates(keep='first').
    word_list = list(dict.fromkeys(word_list))
    with codecs.open(output_path, 'w', 'utf8') as w:
        for word in word_list:
            w.write(word + '\n')
# Filters out numeric tokens and (long) ASCII-alphanumeric tokens at the
# start of a line; the alternation is anchored with ^ so at most the
# leading portion of each line is removed.
_NOISE_RE = re.compile(
    r'^[1-9]\d*\.\d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?\d+)(\.\d+)?$|^[A-Za-z0-9]{4,40}.*?'
)

def word_messy(word, output_path='word.txt'):
    """Refine a word list: drop numeric/ASCII noise, sort, and rewrite.

    Reads *word* (UTF-8) line by line, deletes the leading match of the
    noise pattern from each line, sorts the results, and writes them one
    per line to *output_path* (UTF-8).

    :param word: path of the word-list file to clean.
    :param output_path: destination file, defaults to ``word.txt`` as in
        the original implementation.
    """
    cleaned = []
    with codecs.open(word, 'r', 'utf8') as f:
        for line in f:
            cleaned.append(_NOISE_RE.sub('', line))
    cleaned.sort()
    with codecs.open(output_path, 'w', 'utf8') as w:
        for line in cleaned:
            w.write(line.strip("\n") + '\n')
if __name__ == '__main__':
    # Input corpus: sentences delimited by <b>/<e> markers.
    source_path = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\xianbingshi_sub_sen_all(1).txt'
    simplification_text(source_path)
補充:python 進行結巴分詞 并且用re去掉符號
看代碼吧~
# Build the stop-word lookup table (one word per line in stop_words.txt).
stopwords = {}
# NOTE: the original passed errors='ingnore' (typo) — an unknown error
# handler raises LookupError on the first read, so this is a real fix.
with open('stop_words.txt', 'r', encoding='utf-8', errors='ignore') as fstop:
    for eachWord in fstop:
        stopwords[eachWord.strip()] = eachWord.strip()  # stop-word dictionary

# Segment all.txt with jieba, strip punctuation, drop stop words, and
# write the space-separated tokens to allutf11.txt.
with open('all.txt', 'r', encoding='utf-8', errors='ignore') as f1, \
     open('allutf11.txt', 'w', encoding='utf-8') as f2:
    for line in f1:
        line = line.strip()  # trim surrounding whitespace
        # remove ASCII and full-width punctuation / digits
        line = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", " ", line)
        kept = []
        for word in jieba.cut(line, cut_all=False):  # accurate-mode segmentation
            if word not in stopwords:
                kept.append(word + " ")
        # join once per line instead of quadratic string concatenation
        f2.write("".join(kept))

以上為個人經驗,希望能給大家一個參考,也希望大家多多支持腳本之家。
您可能感興趣的文章:
相關文章
wx.CheckBox創建復選框控件并響應鼠標點擊事件
這篇文章主要為大家詳細介紹了wx.CheckBox創建復選框控件并響應鼠標點擊事件,具有一定的參考價值,感興趣的小伙伴們可以參考一下2018-04-04
在pycharm中關掉ipython console/PyDev操作
這篇文章主要介紹了在pycharm中關掉ipython console/PyDev操作,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧2020-06-06

