本文实例讲述了python提取内容关键词的方法。分享给大家供大家参考。具体分析如下:
一个非常高效的提取内容关键词的python代码,这段代码只能用于英文文章内容,中文因为要分词,这段代码就无能为力了,不过要加上分词功能,效果和英文是一样的。
复制代码 代码如下:
# coding=UTF-8
import nltk
from nltk.corpus import brown
# This is a fast and simple noun phrase extractor (based on NLTK)
# Feel free to use it, just keep a link back to this post
# http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
# Create by Shlomi Babluki
# May, 2013
# This is our fast Part of Speech tagger
#############################################################################
brown_train = brown.tagged_sents(categories=\’news\’)
regexp_tagger = nltk.RegexpTagger(
[(r\’^-?[0-9]+(.[0-9]+)?$\’, \’CD\’),
(r\'(-|:|;)$\’, \’:\’),
(r\’\\\’*$\’, \’MD\’),
(r\'(The|the|A|a|An|an)$\’, \’AT\’),
(r\’.*able$\’, \’JJ\’),
(r\’^[A-Z].*$\’, \’NNP\’),
(r\’.*ness$\’, \’NN\’),
(r\’.*ly$\’, \’RB\’),
(r\’.*s$\’, \’NNS\’),
(r\’.*ing$\’, \’VBG\’),
(r\’.*ed$\’, \’VBD\’),
(r\’.*\’, \’NN\’)
])
unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
#############################################################################
# This is our semi-CFG; Extend it according to your own needs
#############################################################################
cfg = {}
cfg[\”NNP+NNP\”] = \”NNP\”
cfg[\”NN+NN\”] = \”NNI\”
cfg[\”NNI+NN\”] = \”NNI\”
cfg[\”JJ+JJ\”] = \”JJ\”
cfg[\”JJ+NN\”] = \”NNI\”
#############################################################################
class NPExtractor(object):
def __init__(self, sentence):
self.sentence = sentence
# Split the sentence into singlw words/tokens
def tokenize_sentence(self, sentence):
tokens = nltk.word_tokenize(sentence)
return tokens
# Normalize brown corpus\’ tags (\”NN\”, \”NN-PL\”, \”NNS\” > \”NN\”)
def normalize_tags(self, tagged):
n_tagged = []
for t in tagged:
if t[1] == \”NP-TL\” or t[1] == \”NP\”:
n_tagged.append((t[0], \”NNP\”))
continue
if t[1].endswith(\”-TL\”):
n_tagged.append((t[0], t[1][:-3]))
continue
if t[1].endswith(\”S\”):
n_tagged.append((t[0], t[1][:-1]))
continue
n_tagged.append((t[0], t[1]))
return n_tagged
# Extract the main topics from the sentence
def extract(self):
tokens = self.tokenize_sentence(self.sentence)
tags = self.normalize_tags(bigram_tagger.tag(tokens))
merge = True
while merge:
merge = False
for x in range(0, len(tags) – 1):
t1 = tags[x]
t2 = tags[x + 1]
key = \”%s+%s\” % (t1[1], t2[1])
value = cfg.get(key, \’\’)
if value:
merge = True
tags.pop(x)
tags.pop(x)
match = \”%s %s\” % (t1[0], t2[0])
pos = value
tags.insert(x, (match, pos))
break
matches = []
for t in tags:
if t[1] == \”NNP\” or t[1] == \”NNI\”:
#if t[1] == \”NNP\” or t[1] == \”NNI\” or t[1] == \”NN\”:
matches.append(t[0])
return matches
# Main method, just run \”python np_extractor.py\”
def main():
sentence = \”Swayy is a beautiful new dashboard for discovering and curating online content.\”
np_extractor = NPExtractor(sentence)
result = np_extractor.extract()
print \”This sentence is about: %s\” % \”, \”.join(result)
if __name__ == \’__main__\’:
main()
希望本文所述对大家的Python程序设计有所帮助。