# -*- coding: utf-8 -*-
import sys
import codecs
import jieba
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

# Python 2: setdefaultencoding is removed by site.py and only
# becomes reachable again after reload(sys)
reload(sys)
sys.setdefaultencoding('utf8')
def doSeg(filename):
    # Read the raw text and segment it with jieba
    f = open(filename, 'r')
    file_list = f.read()
    f.close()

    seg_list = jieba.cut(file_list)

    # Load the stop-word list, one word per line
    stopwords = []
    for word in open("./stop_words.txt", "r"):
        stopwords.append(word.strip())

    # Keep only segments that are neither stop words nor whitespace
    ll = []
    for seg in seg_list:
        if (seg.encode("utf-8") not in stopwords
                and seg != ' ' and seg != '' and seg != "\n" and seg != "\n\n"):
            ll.append(seg)
    return ll
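For a quick sanity check, doSeg can be called directly on a file. This usage sketch is not part of the original script; review.txt is an assumed sample file, and the actual tokens depend on jieba's dictionary and version:

# Hypothetical usage: segment a file and inspect the first tokens
words = doSeg("review.txt")
print " / ".join(words[:10])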
def loadWordNet():
    # Parse the Chinese Open Wordnet file: each line is
    # "synset-id<TAB>lemma[<TAB>status]"
    f = codecs.open("./cow-not-full.txt", "rb", "utf-8")
    known = set()
    for l in f:
        if l.startswith('#') or not l.strip():
            continue
        row = l.strip().split("\t")
        if len(row) == 3:
            (synset, lemma, status) = row
        elif len(row) == 2:
            (synset, lemma) = row
            status = 'Y'
        else:
            print "illformed line: ", l.strip()
            continue  # skip malformed lines instead of reusing a stale status
        if status in ['Y', 'O']:
            if not (synset.strip(), lemma.strip()) in known:
                known.add((synset.strip(), lemma.strip()))
    return known
def findWordNet(known, key):
    # Collect every synset id whose lemma matches the given word
    ll = []
    for kk in known:
        if kk[1] == key:
            ll.append(kk[0])
    return ll
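The main loop below calls two helpers, id2ss and getSenti, that do not appear in this snippet. A minimal sketch of what they could look like, assuming NLTK 3's API and synset ids in the offset-pos form (e.g. 00001740-a) that wn.of2ss accepts:

def id2ss(ID):
    # Assumed helper: map an "offset-pos" id such as "00001740-a"
    # to an NLTK synset object
    return wn.of2ss(ID)

def getSenti(synset):
    # Assumed helper: look up the SentiWordNet entry for a synset
    # via its name, e.g. "able.a.01"
    return swn.senti_synset(synset.name())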
if __name__ == '__main__':
    known = loadWordNet()
    words = doSeg(sys.argv[1])

    n = 0
    p = 0
    for word in words:
        ll = findWordNet(known, word)
        if len(ll) != 0:
            # Average the SentiWordNet scores over all candidate synsets
            n1 = 0.0
            p1 = 0.0
            for wid in ll:
                desc = id2ss(wid)
                swninfo = getSenti(desc)
                p1 = p1 + swninfo.pos_score()
                n1 = n1 + swninfo.neg_score()
            if p1 != 0.0 or n1 != 0.0:
                print word, '-> n ', (n1 / len(ll)), ", p ", (p1 / len(ll))
            p = p + p1 / len(ll)
            n = n + n1 / len(ll)
    print "n", n, ", p", p
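The script takes the text file to score as its only argument; senti.py is an assumed file name. Each matched word is printed with its averaged negative and positive scores, and the final line prints the accumulated totals for the whole document:

$ python senti.py review.txt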
3. Open problems
Words produced by jieba segmentation do not map one-to-one onto the entries in WordNet Chinese.
Although jieba can load a custom dictionary, some of the words it segments out still have no corresponding sense in WordNet, for example 太后 ("empress dowager") and 童子 ("young boy"), as well as compounds such as 很早已前 ("long ago") and 黄山 (Huangshan). Most of these are nouns and will need further "learning"; the custom-dictionary route is sketched below.
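For the dictionary route mentioned above, jieba exposes load_userdict and add_word. This is only a sketch: userdict.txt and its entries are hypothetical, and each line follows jieba's "word [frequency] [POS tag]" format:

# userdict.txt (assumed file), one entry per line: word [freq] [pos]
#   太后 10 n
#   黄山 10 ns
import jieba
jieba.load_userdict("./userdict.txt")      # merge custom entries into the dictionary
jieba.add_word(u"童子", freq=10, tag="n")  # or add single words programmatically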