pre_file.py
# -*- coding: utf-8 -*-
"""Segment crawled page content with jieba and dump it to a text file.

Reads the ``link`` and ``content`` columns of ``test1.spider`` and writes one
CRLF-terminated line per row: the link, a space, then the content's tokens
joined with ``/``.

This is a Python 2 script: it relies on ``reload(sys)`` and
``sys.setdefaultencoding``.
"""
import MySQLdb
import MySQLdb as mdb
import os
import sys
import string
import jieba
import codecs

# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Connect to the database.
# NOTE(review): credentials are hard-coded here; consider moving them to
# configuration or environment variables.
try:
    conn = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunli',
                       db='test1', charset='utf8')
except Exception as e:
    print(e)
    # Exit with a non-zero status so callers can see that the export failed.
    sys.exit(1)

# Fetch every row up front, then release the cursor and the connection even
# if the query fails. DictCursor makes each row a dict keyed by column name.
try:
    cursor = conn.cursor(mdb.cursors.DictCursor)
    try:
        sql = 'SELECT link,content FROM test1.spider;'
        cursor.execute(sql)
        data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    # The script only reads, so there is nothing to commit before closing.
    conn.close()

# The context manager guarantees the output file is flushed and closed even
# when segmentation or writing raises part-way through.
with codecs.open('C:\\Users\\kk\\Desktop\\hello-result1.txt', 'w', 'utf-8') as f:
    for row in data:
        # cut_all must be the boolean False: the string 'False' is truthy and
        # would silently switch jieba to full mode.
        seg = '/'.join(jieba.cut(row['content'], cut_all=False))
        f.write(row['link'] + ' ' + seg + '\r\n')
jiansuo.py
# -*- coding: utf-8 -*-
"""Rank the stored, pre-segmented documents against an interactive query.

Loads every row of ``cutresult_copy``, builds a gensim dictionary and TF-IDF
model from the '/'-separated tokens in each row's second column, prompts for
a whitespace-separated query, and prints the similarity scores of the query
against all documents in descending order.

This is a Python 2 script: it relies on ``reload(sys)``,
``sys.setdefaultencoding`` and ``raw_input``.
"""
import sys
import string
import MySQLdb
import MySQLdb as mdb
import gensim
from gensim import corpora, models, similarities
from gensim.similarities import MatrixSimilarity
import logging
import codecs

# Python 2 only: make implicit str <-> unicode conversions use UTF-8, which
# the str(...) call in MyCorpus depends on for non-ASCII text.
reload(sys)
sys.setdefaultencoding('utf-8')

# NOTE(review): credentials are hard-coded here; consider moving them to
# configuration or environment variables.
con = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunli',
                  db='test1', charset='utf8')
# Close the cursor and connection explicitly: what `with con:` does has
# differed between MySQLdb/mysqlclient releases, and in the classic driver it
# scopes a transaction rather than closing anything.
try:
    cur = con.cursor()
    try:
        cur.execute('SELECT * FROM cutresult_copy')
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    con.close()


class MyCorpus(object):
    """Iterable yielding one token list per row of the module-level ``rows``.

    Being re-iterable matters: the corpus is walked once to build the
    dictionary and again to build the bag-of-words vectors.
    """

    def __iter__(self):
        for row in rows:
            # Column index 1 is split on '/'.
            # NOTE(review): presumably this is the segmented text; SELECT *
            # makes the index depend on the table's column order - confirm.
            yield str(row[1]).split('/')


# Enable logging so gensim reports its progress.
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                    level=logging.INFO)

Corp = MyCorpus()

# Convert the documents to TF-IDF: token -> id mapping, then bag-of-words
# vectors, then the TF-IDF model fitted on them and applied to the corpus.
dictionary = corpora.Dictionary(Corp)
corpus = [dictionary.doc2bow(text) for text in Corp]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Disabled alternative kept from the original: read the query from the first
# line of C:\Users\kk\Desktop\q.txt and split it on single spaces instead of
# prompting for it.
query = raw_input('Enter your query:')

# Turn the query into a bag-of-words vector, then into TF-IDF weights.
vec_bow = dictionary.doc2bow(query.split())
vec_tfidf = tfidf[vec_bow]

# Score the query against every document and print the scores, highest first.
index = similarities.MatrixSimilarity(corpus_tfidf)
sims = index[vec_tfidf]
similarity = list(sims)
print(sorted(similarity, reverse=True))
encodings.xml
misc.xml
modules.xml