본문 바로가기
  • Let's go grab a data
Data/Python

자연어 처리 영사전 만들어 비교하기

by pub-lican-ai 2018. 12. 12.
반응형

import nltk

import pandas as pd

from nltk.corpus import gutenberg

# Load the word tokens of NLTK's Gutenberg corpus (no fileids are passed).
tokens = gutenberg.words()

# Bare expression: presumably shown as cell output in a notebook/REPL.
tokens

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

사전에 없는 단어 선별


# Convert the tokens to a pandas Series so that vectorised operations can
# replace an explicit for-loop comparing each token with the dictionary.
tokens_series = pd.Series(tokens)

# Boolean mask: True where a token consists solely of alphabetic characters.
isAlpha = tokens_series.str.isalpha()

# Keep the alphabetic tokens only and lower-case them.
words_series = tokens_series.loc[isAlpha].str.lower()

# Keep the first occurrence of every distinct word.
words_drop_duplicate = words_series[~words_series.duplicated()]

# Number of distinct lower-cased alphabetic words.
len(words_drop_duplicate)

41487

from nltk.corpus import words
words.fileids()
# Reference vocabulary: the word list of NLTK's `words` corpus.
voca = words.words()
# True where a word IS found in the reference vocabulary (first 10 shown).
words_drop_duplicate.isin(voca)[:10]
# `not words_drop_duplicate.isin(voca)` would try to negate the Series as one
# object; pandas raises ValueError for that because the truth value of a
# multi-element Series is ambiguous.
# `~` negates element by element: True where a word is NOT in the vocabulary.
~words_drop_duplicate.isin(voca)[:10]
1     False
2     False
3     False
4      True
7     False
8     False
9     False
12    False
14    False
16    False

# Mask of words that are absent from the reference vocabulary.
none = ~words_drop_duplicate.isin(voca)
# Show the first ten out-of-vocabulary words.
words_drop_duplicate.loc[none].iloc[:10]
4         austen
29        seemed
36     blessings
47         years
63      youngest
67     daughters
102         died
117     caresses
124     supplied
147       taylor
dtype: object

# Build a dictionary of base forms (stems) from the BNC.
from nltk.corpus.reader.bnc import BNCCorpusReader

bnc = BNCCorpusReader(
    root='corpus/BNC/2554/download/Texts/',
    fileids=r'[A-K]/.+/.+\.xml$',
)
len(bnc.fileids())

# Only the first two matching files are read.
fids = bnc.fileids()[:2]

# The same files read twice: surface forms, then with stem=True.
tokens_bnc = bnc.words(fileids=fids)
tokens_bnc_stems = bnc.words(fileids=fids, stem=True)

# Print the first ten entries of each sequence for a side-by-side check.
for sample in (tokens_bnc[:10], tokens_bnc_stems[:10]):
    print(sample)
['FACTSHEET', 'WHAT', 'IS', 'AIDS', '?', 'AIDS', '(', 'Acquired', 'Immune', 'Deficiency']
['factsheet', 'what', 'be', 'aids', '?', 'aids', '(', 'acquire', 'immune', 'deficiency']

# Put the surface forms and their stems side by side in a DataFrame.
# The two sequences are stacked as labelled rows and then transposed, so the
# row labels become the column names 'word' and 'stem'.
words_table = pd.DataFrame(
    [tokens_bnc, tokens_bnc_stems],
    index=['word', 'stem'],
).T
print(words_table.head(10))
         word        stem
0   FACTSHEET   factsheet
1        WHAT        what
2          IS          be
3        AIDS        aids
4           ?           ?
5        AIDS        aids
6           (           (
7    Acquired     acquire
8      Immune      immune
9  Deficiency  deficiency

# Clean the word/stem table.
# Lower-case the surface forms so lookups are case-insensitive.
words_table['word'] = words_table['word'].str.lower()
# Mask of rows whose word is purely alphabetic (drops punctuation etc.).
isAlpha = words_table['word'].str.isalpha()
# Filter, remove duplicate (word, stem) rows, and order alphabetically by word.
words_table = (
    words_table.loc[isAlpha]
    .drop_duplicates()
    .sort_values(by='word')
)
print(words_table.head(10))
             word        stem
13              a           a
1212   abbeylands  abbeylands
3964      ability     ability
842          able        able
741         about       about
12315       above       above
5027       abroad      abroad
512    absolutely  absolutely
9104       absorb      absorb
11943      accept      accept

# Build the word -> stem conversion dictionary: a Series of stems indexed by
# word, converted to a plain dict.
# set_index() returns a new frame, so the Series taken from it carries the
# words as its index without re-indexing the 'stem' column object that
# belongs to words_table itself.
words_table_trans = words_table.set_index('word')['stem']
# NOTE(review): a word that occurs with several different stems ends up with
# a single dict entry (presumably the last row for that word) — confirm this
# is acceptable.
words_table_trans_dict = words_table_trans.to_dict()
words_table_trans_dict
{'FACTSHEET': 'factsheet',
 'WHAT': 'what',
 'IS': 'be',
 'AIDS': 'aids',
 '?': '?',
 '(': '(',
 'Acquired': 'acquire',
 'Immune': 'immune',
 'Deficiency': 'deficiency',
 'Syndrome': 'syndrome', 
  ...

# Map every word to its stem: when the word is a key of the conversion
# dictionary use the stored stem, otherwise keep the word unchanged.
def _stem_or_self(word):
    return words_table_trans_dict.get(word, word)


words_stem = words_drop_duplicate.map(_stem_or_self)

# Count the mapped values that are not found in the conversion dictionary.
# NOTE(review): isin() is given the dict itself, so membership is presumably
# tested against its keys (surface forms), not its stem values — confirm that
# this is the intended comparison.
stem_not_in_dict = ~words_stem.isin(words_table_trans_dict)
len(words_stem[stem_not_in_dict])
39939
# Same count for the original (un-stemmed) words, for comparison.
word_not_in_dict = ~words_drop_duplicate.isin(words_table_trans_dict)
len(words_drop_duplicate.loc[word_not_in_dict])
39770


반응형