본문 바로가기
  • Let's go grab a data
Data/Python

Corpus type - Categories, Tagged, UDHR

by pub-lican-ai 2018. 12. 11.
반응형

import pandas as pd

from nltk.corpus import brown

# Print the Brown corpus reader object (the original note here names its
# type as CategorizedTaggedCorpusReader).
# NOTE: the bare expressions below only display a value when run as the last
# expression of an interactive/notebook cell.

print(brown)


# First five file ids of the corpus.
brown.fileids()[:5]

# Token view over the whole corpus.
brown.words()

# (token, tag) pairs, e.g. ('The', 'AT').
brown.tagged_words()

# Names of the categories the corpus is split into.
brown.categories()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]
['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

# Restrict both the token view and the file ids to the 'news' category.
brown.words(categories='news')
brown.fileids(categories='news')[:5]
['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
['ca01', 'ca02', 'ca03', 'ca04', 'ca05']

분류별 어휘 스타일 비교


# Token sequences for the two categories that are compared below.
news_tokens = brown.words(categories='news')
fiction_tokens = brown.words(categories='fiction')

# Wrap the news tokens in a Series so pandas string/count helpers can be used.
news_tokens_series = pd.Series(news_tokens)
news_tokens_series[:5]

# Ten most frequent raw tokens (punctuation and case variants still included).
news_tokens_series.value_counts()[:10]
the    5580
,      5188
.      4030
of     2849
and    2146
to     2116
a      1993
in     1893
for     943
The     806
dtype: int64

단어 토큰만 선택


# pandas Series.str: when the values are strings, string methods can be
# applied to every element through the .str accessor.
news_tokens_series = news_tokens_series.str.lower()

# Keep only the purely alphabetic tokens (drops punctuation and numbers).
isAlpha = news_tokens_series.str.isalpha()
news_words_series = news_tokens_series[isAlpha]

# Count once and reuse the result for both the preview and later lookups.
news_words_series_count = news_words_series.value_counts()

print(news_words_series[:10])
print('\n')
print(news_words_series_count[:10])
0              the
1           fulton
2           county
3            grand
4             jury
5             said
6           friday
7               an
8    investigation
9               of
dtype: object

the     6386
of      2861
and     2186
to      2144
a       2130
in      2020
for      969
that     829
is       733
was      717
dtype: int64



# Same steps as for the news category, applied to the fiction tokens.
fiction_tokens_series = pd.Series(fiction_tokens)
fiction_tokens_series = fiction_tokens_series.str.lower()
isAlpha = fiction_tokens_series.str.isalpha()
fiction_words_series = fiction_tokens_series[isAlpha]
fiction_words_series.value_counts()[:10]

# Spelled "series" (not "serise") so it matches news_words_series_count.
fiction_words_series_count = fiction_words_series.value_counts()

# Words of interest.
words = ['can', 'could', 'may']

# Absolute frequency of the words of interest in each category.
print(news_words_series_count[words])
print(fiction_words_series_count[words])

# Relative frequency: each count divided by the total number of word tokens
# in that category, so categories of different size can be compared.
news_words_rate = news_words_series_count / news_words_series_count.sum()
fiction_words_rate = fiction_words_series_count / fiction_words_series_count.sum()

print(news_words_rate[words])
print(fiction_words_rate[words])

can      94
could    87
may      93
dtype: int64
can       39
could    168
may       10
dtype: int64
can      0.001125
could    0.001041
may      0.001113
dtype: float64
can      0.000683
could    0.002943
may      0.000175
dtype: float64

#동일 내용 함수화
def calWordRate(CorpusReader, cat):
    """Return the relative frequency of each alphabetic word in a category.

    Args:
        CorpusReader: Object exposing ``words(categories=...)`` that yields
            the tokens of the requested category.
        cat: Category selector forwarded unchanged to ``words``.

    Returns:
        A pandas Series indexed by lower-cased word whose values are that
        word's count divided by the total count of alphabetic tokens.
    """
    raw = pd.Series(CorpusReader.words(categories=cat))
    # Filter on the original tokens first, then normalise the case.
    alphabetic = raw[raw.str.isalpha()]
    counts = alphabetic.str.lower().value_counts()
    return counts / counts.sum()

# Relative word frequencies for the 'news' category; show the first ten entries.
news_words_rate = calWordRate(brown,'news')
news_words_rate[:10]
the     0.076422
of      0.034238
and     0.026160
to      0.025658
a       0.025490
in      0.024174
for     0.011596
that    0.009921
is      0.008772
was     0.008580
dtype: float64
# Relative word frequencies for every Brown category, keyed by category name.
words_rate_for_category = {
    cat: calWordRate(brown, cat) for cat in brown.categories()
}

# print(words_rate_for_category)  # uncomment to inspect the raw dict

# One row per category, one column per word; a word that does not occur in a
# category shows up as NaN.
pd.DataFrame(words_rate_for_category).T


                        a        aa       aaa    aaawww       aah     aaron        ab     aback   abandon abandoned  ...    zoooop
adventure        0.025274       NaN       NaN  0.000018  0.000018  0.000035       NaN  0.000018  0.000018  0.000018  ...       NaN
belles_lettres   0.023154       NaN       NaN       NaN       NaN  0.000007       NaN       NaN  0.000047  0.000027  ...       NaN
editorial        0.022041       NaN       NaN       NaN       NaN       NaN       NaN       NaN  0.000019  0.000057  ...       NaN
fiction          0.023456       NaN       NaN       NaN       NaN  0.000018       NaN       NaN       NaN  0.000018  ...       NaN
government       0.016194       NaN       NaN       NaN       NaN       NaN       NaN       NaN  0.000017       NaN  ...       NaN
hobbies          0.026620       NaN  0.000014       NaN       NaN       NaN       NaN       NaN  0.000014  0.000014  ...  0.000029
humor            0.029422       NaN       NaN       NaN       NaN       NaN       NaN       NaN       NaN       NaN  ...       NaN
learned          0.022052       NaN       NaN       NaN       NaN  0.000006  0.000013       NaN       NaN  0.000006  ...       NaN
lore             0.025607       NaN       NaN       NaN       NaN  0.000011       NaN       NaN  0.000021  0.000053  ...       NaN
mystery          0.025661       NaN       NaN       NaN       NaN       NaN       NaN       NaN  0.000043  0.000043  ...       NaN
news             0.025490       NaN       NaN       NaN       NaN  0.000012       NaN       NaN       NaN  0.000036  ...       NaN
religion         0.020437       NaN       NaN       NaN       NaN       NaN       NaN       NaN  0.000059  0.000059  ...       NaN
reviews          0.027266       NaN       NaN       NaN       NaN       NaN       NaN       NaN       NaN  0.000059  ...       NaN
romance          0.024324  0.000018       NaN       NaN       NaN       NaN       NaN  0.000018       NaN       NaN  ...       NaN
science_fiction  0.020065       NaN       NaN       NaN       NaN       NaN  0.000085       NaN       NaN       NaN  ...



# Transpose the dict-of-Series frame so that rows are categories and columns are words.
words_rate_for_category_table = pd.DataFrame(words_rate_for_category).T

# Stacked horizontal bar chart restricted to the columns listed in `words`,
# one bar per category.
words_rate_for_category_table[words].plot(kind='barh', stacked=True)



from nltk.corpus import udhr

# Bare expression: shows the reader object when evaluated interactively.
udhr

# Universal Declaration of Human Rights: the same content is available in
# different languages, which makes it source material for machine translation.

udhr.fileids()[147:148]

# NOTE(review): index 147 is hard-coded; the sample printed below is Korean,
# but the position may differ between corpus versions - confirm, or look the
# file id up by name instead.
fid_korean = udhr.fileids()[147]

# Print the first 100 characters of the raw text of that file.
print(udhr.raw(fid_korean)[:100])

세 계 인 권 선 언

전 문 

모든 인류 구성원의 천부의 존엄성과 동등하고 양도할 수 없는 권리를 인정하는 것이 세계의 자유 , 정의 및 평화의 기초이며 , 

인권에 대한 무


반응형