반응형
# Load the Project Gutenberg corpus bundled with NLTK and inspect its tokens.
import nltk
import pandas as pd
from nltk.corpus import gutenberg
# Word tokens across all Gutenberg files (a lazy corpus view, not a list).
tokens = gutenberg.words()
tokens
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]
사전에 없는 단어 선별
# Convert the tokens to a pandas Series so the dictionary comparison can be
# vectorized instead of written as an explicit for-loop.
tokens_series = pd.Series(tokens)
isAlpha = tokens_series.str.isalpha()
# Keep alphabetic tokens only, lowercased.
words_series = tokens_series.loc[isAlpha].str.lower()
# Equivalent to drop_duplicates(): keep the first occurrence of each word.
words_drop_duplicate = words_series[~words_series.duplicated()]
len(words_drop_duplicate)
41487
from nltk.corpus import words
words.fileids()
# Reference vocabulary from NLTK's English word list.
# (The original line fused the assignment and the expression below into one
# invalid statement — a copy/paste artifact; split back into two statements.)
voca = words.words()
words_drop_duplicate.isin(voca)[:10]
# `not s` would try to collapse the Series into a single bool;
# `~` inverts element-wise. Note the slice binds first: ~(s.isin(voca)[:10]).
~words_drop_duplicate.isin(voca)[:10]
1 False
2 False
3 False
4 True
7 False
8 False
9 False
12 False
14 False
16 False
# Boolean mask of words NOT found in the reference vocabulary.
# Renamed from `none`, which is too easily confused with the builtin `None`.
oov_mask = ~words_drop_duplicate.isin(voca)
words_drop_duplicate[oov_mask][:10]
4         austen
29        seemed
36     blessings
47         years
63      youngest
67     daughters
102         died
117     caresses
124     supplied
147       taylor
dtype: object
# Build a lemma (stem) dictionary from the BNC corpus.
# NOTE(review): the scrape collapsed this whole section into a single comment
# line plus one fused line; reconstructed as separate statements below.
from nltk.corpus.reader.bnc import BNCCorpusReader
bnc = BNCCorpusReader(root='corpus/BNC/2554/download/Texts/',
                      fileids=r'[A-K]/.+/.+\.xml$')
len(bnc.fileids())
# Use only the first two files to keep the example fast.
fids = bnc.fileids()[:2]
tokens_bnc = bnc.words(fileids=fids)
tokens_bnc_stems = bnc.words(fileids=fids, stem=True)
print(tokens_bnc[:10])
print(tokens_bnc_stems[:10])
['FACTSHEET', 'WHAT', 'IS', 'AIDS', '?', 'AIDS', '(', 'Acquired', 'Immune', 'Deficiency']
['factsheet', 'what', 'be', 'aids', '?', 'aids', '(', 'acquire', 'immune', 'deficiency']
# Assemble the English lemma dictionary as a two-column DataFrame.
# Label the rows before transposing so they become the column names.
words_table = pd.DataFrame([tokens_bnc, tokens_bnc_stems],
                           index=['word', 'stem']).T
print(words_table[:10])
word stem
0 FACTSHEET factsheet
1 WHAT what
2 IS be
3 AIDS aids
4 ? ?
5 AIDS aids
6 ( (
7 Acquired acquire
8 Immune immune
9 Deficiency deficiency
# Clean the lemma dictionary: lowercase, alphabetic-only, unique, sorted.
words_table['word'] = words_table['word'].str.lower()
isAlpha = words_table['word'].str.isalpha()
words_table = (words_table.loc[isAlpha]
               .drop_duplicates()
               .sort_values(by='word'))
print(words_table.head(10))
word stem
13 a a
1212 abbeylands abbeylands
3964 ability ability
842 able able
741 about about
12315 above above
5027 abroad abroad
512 absolutely absolutely
9104 absorb absorb
11943 accept accept
# Turn the table into a word -> stem translation mapping:
# a Series indexed by the word, then a plain dict for O(1) lookups.
words_table_trans = words_table.set_index('word')['stem']
words_table_trans_dict = dict(words_table_trans)
words_table_trans_dict
{'FACTSHEET': 'factsheet',
'WHAT': 'what',
'IS': 'be',
'AIDS': 'aids',
'?': '?',
'(': '(',
'Acquired': 'acquire',
'Immune': 'immune',
'Deficiency': 'deficiency',
'Syndrome': 'syndrome',
...
# Look each word up with dict.get: substitute the stem when the word is in
# the dictionary, otherwise keep the word itself unchanged.
words_stem = words_drop_duplicate.apply(
    lambda w: words_table_trans_dict.get(w, w))
len(words_stem[~words_stem.isin(words_table_trans_dict)])
39939
# Baseline: how many un-stemmed words are missing from the dictionary.
len(words_drop_duplicate[~words_drop_duplicate.isin(words_table_trans_dict)])
39770
반응형
'Data > Python' 카테고리의 다른 글
Word Net 대응, synsets, synset, 거리측정 (0) | 2018.12.12 |
---|---|
자연어 처리 한글사전 만들어 비교하기 (0) | 2018.12.12 |
한글 말뭉치 리더기 만들기 (세종) (0) | 2018.12.11 |
사용자 정의 말뭉치 읽고 처리 (0) | 2018.12.11 |
Corpus type - Categories, Tagged, UDHR (0) | 2018.12.11 |