자연어 처리 영사전 만들어 비교하기

import nltk

import pandas as pd

from nltk.corpus import gutenberg

# Load the word tokens of the NLTK Gutenberg corpus (no fileids argument is
# given; the sample output below starts with Austen's "Emma").
tokens = gutenberg.words()

# Bare expression: presumably run in a notebook/REPL cell to display the tokens.
tokens

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

사전에 없는 단어 선별

# Work on a pandas Series so that the comparison between the dictionary and
# the tokens can use vectorised operations instead of an explicit for-loop.
tokens_series = pd.Series(tokens)

# Boolean mask: True where the token consists of alphabetic characters only.
isAlpha = tokens_series.str.isalpha()

# Keep the alphabetic tokens and normalise them to lower case.
words_series = tokens_series.loc[isAlpha].str.lower()

# One entry per distinct word; the first occurrence (and its index) is kept.
words_drop_duplicate = words_series.drop_duplicates(keep='first')

# Number of distinct lower-cased alphabetic words.
len(words_drop_duplicate)

from nltk.corpus import words
words.fileids()
# Reference vocabulary: the word list of the NLTK `words` corpus.
voca = words.words()
# Element-wise membership test; show the result for the first ten words.
# (This statement was fused onto the previous line, which is a syntax error.)
words_drop_duplicate.isin(voca)[:10]
# NOTE: `not words_drop_duplicate.isin(voca)` applies to the Series object as a
# whole (pandas raises ValueError because its truth value is ambiguous), so it
# cannot be used to flip the mask.
# `~` inverts each element instead: True marks a word that is NOT in `voca`.
~words_drop_duplicate.isin(voca)[:10]

1     False
2     False
3     False
4      True
7     False
8     False
9     False
12    False
14    False
16    False

# Mask of the words that are absent from the reference vocabulary `voca`.
none = ~words_drop_duplicate.isin(voca)
# Display the first ten words that the vocabulary does not contain.
words_drop_duplicate.loc[none].head(10)

4         austen
29        seemed
36     blessings
47         years
63      youngest
67     daughters
102         died
117     caresses
124     supplied
147       taylor
dtype: object

# Build a word -> base-form dictionary from the BNC (British National Corpus).
from nltk.corpus.reader.bnc import BNCCorpusReader
# The corpus is read from a local directory; the fileids regex selects .xml
# files nested two directory levels below a top-level folder named A through K.
bnc = BNCCorpusReader(root='corpus/BNC/2554/download/Texts/',
                     fileids=r'[A-K]/.+/.+\.xml$')
len(bnc.fileids())
# Only the first two files are used for the rest of this walkthrough.
fids = bnc.fileids()[:2]
# Surface tokens, and the same tokens requested with stem=True (the sample
# output below shows lower-cased base forms such as 'IS' -> 'be').
tokens_bnc = bnc.words(fileids=fids)
tokens_bnc_stems = bnc.words(fileids=fids, stem=True)
print(tokens_bnc[:10])
print(tokens_bnc_stems[:10])

['FACTSHEET', 'WHAT', 'IS', 'AIDS', '?', 'AIDS', '(', 'Acquired', 'Immune', 'Deficiency']
['factsheet', 'what', 'be', 'aids', '?', 'aids', '(', 'acquire', 'immune', 'deficiency']

# Turn the English base-form dictionary into a DataFrame.
# The two token sequences are stacked as two rows and then transposed, so each
# token becomes one row with its surface form and its stem side by side.
words_table = pd.DataFrame([tokens_bnc, tokens_bnc_stems]).T
words_table.columns = ['word','stem']
print(words_table[:10])

         word        stem
0   FACTSHEET   factsheet
1        WHAT        what
2          IS          be
3        AIDS        aids
4           ?           ?
5        AIDS        aids
6           (           (
7    Acquired     acquire
8      Immune      immune
9  Deficiency  deficiency

# Clean the English base-form table.
# Lower-case the surface forms so lookups are case-insensitive.
words_table['word'] = words_table['word'].str.lower()
# Mask of rows whose surface form is purely alphabetic (drops punctuation etc.).
isAlpha = words_table['word'].str.isalpha()
# Filter, remove repeated (word, stem) rows, and order alphabetically by word.
words_table = (
    words_table.loc[isAlpha]
    .drop_duplicates()
    .sort_values(by='word')
)
print(words_table.head(10))

             word        stem
13              a           a
1212   abbeylands  abbeylands
3964      ability     ability
842          able        able
741         about       about
12315       above       above
5027       abroad      abroad
512    absolutely  absolutely
9104       absorb      absorb
11943      accept      accept

# Build a Series of stems indexed by word, then convert it into a
# word -> stem lookup dictionary.
# set_index() returns a new object, so the index is not reassigned in place on
# the Series obtained from words_table['stem']; on pandas versions without
# copy-on-write that Series can be the frame's cached column, and mutating its
# index would leave words_table in an inconsistent state.
words_table_trans = words_table.set_index('word')['stem']
# If a word occurs with more than one stem, only one entry survives in the dict
# (the last one encountered).
words_table_trans_dict = words_table_trans.to_dict()
words_table_trans_dict

{'FACTSHEET': 'factsheet',
 'WHAT': 'what',
 'IS': 'be',
 'AIDS': 'aids',
 '?': '?',
 '(': '(',
 'Acquired': 'acquire',
 'Immune': 'immune',
 'Deficiency': 'deficiency',
 'Syndrome': 'syndrome',

...

# Map every distinct word through the word -> stem dictionary: dict.get returns
# the stem when the word is a key, and falls back to the word itself otherwise.
words_stem = words_drop_duplicate.map(
                                lambda word: words_table_trans_dict.get(word,word))

# Count of mapped values that isin() does not find in the dictionary.
# NOTE(review): a dict is passed to isin(); presumably membership is tested
# against its keys (the surface words), not its stem values -- confirm.
len(words_stem[~words_stem.isin(words_table_trans_dict)])

# Same count for the original, unmapped words, for comparison.
len(words_drop_duplicate[~words_drop_duplicate.isin(words_table_trans_dict)])

저작자표시 비영리 변경금지 (새창열림)

'Data > Python' 카테고리의 다른 글

Word Net 대응, synsets, synset, 거리측정 (0)	2018.12.12
자연어 처리 한글사전 만들어 비교하기 (0)	2018.12.12
한글 말뭉치 리더기 만들기 (세종) (0)	2018.12.11
사용자 정의 말뭉치 읽고 처리 (0)	2018.12.11
Corpus type - Categories, Tagged, UDHR (0)	2018.12.11

On the ball

자연어 처리 영사전 만들어 비교하기

사전에 없는 단어 선별

'Data > Python' 카테고리의 다른 글

티스토리툴바

자연어 처리 영사전 만들어 비교하기

사전에 없는 단어 선별

'Data > Python' 카테고리의 다른 글

관련글

티스토리툴바