import nltk
from nltk.corpus import gutenberg
# Plaintext Corpus Reader
gutenberg
print(gutenberg.fileids()[:10])
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt']
fid_1 = gutenberg.fileids()[0]
raw_text = gutenberg.raw(fid_1)
print(type(raw_text))
print(raw_text[:100])
print(len(raw_text))
<class 'str'>
[Emma by Jane Austen 1816]
VOLUME I
CHAPTER I
Emma Woodhouse, handsome, clever, and rich, with a
887071
tokens = gutenberg.words(fid_1)
print(tokens[:10])
print(len(tokens))
print(len(raw_text)/len(tokens))
['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER']
192427
4.609909212324673
from nltk.corpus import nps_chat  # XML-format Corpus Reader: NPSChatCorpus
nps_chat
nps_chat.fileids()
fid = nps_chat.fileids()[0]
fid
raw_text = nps_chat.raw(fid)
print(raw_text[:412])
<!-- edited with XMLSpy v2007 sp1 (http://www.altova.com) by Eric Forsyth (Naval Postgraduate School) -->
<Session xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="postClassPOSTagset.xsd">
<Posts>
<Post class="Statement" user="10-19-20sUser7">now im left with this gay name<terminals>
<t pos="RB" word="now"/>
<t pos="PRP" word="im"/>
<t pos="VBD" word="left"/>
# The reader understands the XML and parses out only the word-level information
tokens = nps_chat.words(fid)
print(tokens[:10])
# tagged_words also includes the POS information, returned as (token, tag) tuples
tagged_tokens = nps_chat.tagged_words(fid)
print(tagged_tokens[:10])
['now', 'im', 'left', 'with', 'this', 'gay', 'name', ':P', 'PART', 'hey']
[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ('with', 'IN'), ('this', 'DT'), ('gay', 'JJ'), ('name', 'NN'), (':P', 'UH'), ('PART', 'VB'), ('hey', 'UH')]
Corpus analysis
- Select only the nouns (tags starting with N)
- Count the frequency of each POS tag and sort the tags from most to least frequent
nouns = []
#Unpacking
for token, tag in tagged_tokens:
    if tag.startswith('N'):
        nouns.append(token)
print(nouns[:20])
['name', 'everyone', 'NICK', 'U7', 'U7', 'name', 'ACTION', 'U121', 'golf', 'clap', 'U59', 'm', 'ky', 'women', 'U7', 'golf', 'U121', 'everyone', 'thunder', 'ass']
nouns_distinct = []
for token, tag in tagged_tokens:
    if tag.startswith('N') and token not in nouns_distinct:
        nouns_distinct.append(token)
print(nouns_distinct[:20])
['name', 'everyone', 'NICK', 'U7', 'ACTION', 'U121', 'golf', 'clap', 'U59', 'm', 'ky', 'women', 'thunder', 'ass', 'cousin', 'pic', 'cast', 'U115', 'girl', 'legs']
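For a longer token list, the `token not in nouns_distinct` check above gets slow because it rescans the whole list each time. A minimal variation (not the original code) that keeps a set for the membership test while preserving order:
nouns_distinct = []
seen = set()  # set gives O(1) membership checks instead of scanning the list
for token, tag in tagged_tokens:
    if tag.startswith('N') and token not in seen:
        seen.add(token)
        nouns_distinct.append(token)
print(nouns_distinct[:20])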
def selectToken(tagFilter):
    nouns_distinct = []
    for token, tag in tagged_tokens:
        token = token.lower()
        if tag.startswith(tagFilter) and token not in nouns_distinct:
            nouns_distinct.append(token)
    return nouns_distinct
nouns = selectToken('N')
print(nouns[:20])
['name', 'everyone', 'nick', 'u7', 'action', 'u121', 'golf', 'clap', 'u59', 'm', 'ky', 'women', 'thunder', 'ass', 'cousin', 'pic', 'cast', 'u115', 'girl', 'legs']
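Because the tag prefix is a parameter, the same function can be reused for other parts of speech. A small usage sketch (the exact tokens returned depend on the chat session file loaded above):
verbs = selectToken('V')        # 'V' matches all verb tags (VB, VBD, VBG, ...)
adjectives = selectToken('JJ')  # 'JJ' matches adjectives
print(verbs[:10])
print(adjectives[:10])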
# Frequency of each POS tag
numbersOfPos = {}
for token, tag in tagged_tokens:
    if tag not in numbersOfPos:
        numbersOfPos[tag] = 1
    else:
        numbersOfPos[tag] += 1
print(numbersOfPos)
{'RB': 123, 'PRP': 260, 'VBD': 49, 'IN': 105, 'DT': 130, 'JJ': 107, 'NN': 279, 'UH': 255, 'VB': 274, ':': 70, 'NNP': 248, 'VBZ': 94, '.': 206, 'SYM': 49, 'CD': 27, 'CC': 39, 'NNS': 74, 'WDT': 4, 'VBP': 113, 'WP': 24, 'RP': 20, 'TO': 41, 'MD': 28, 'PRP$': 50, '^NNS': 2, ',': 30, 'VBG': 26, "''": 16, '^NN': 3, 'POS': 10, 'EX': 4, 'BES': 10, 'WRB': 17, '^VB': 5, '^RB': 1, 'VBN': 8, 'PDT': 1, '^VBZ': 2, 'RBR': 3, 'RBS': 1, '^VBP': 2, '(': 4, 'NNPS': 1, ')': 5, '^VBG': 1, 'JJR': 3, 'JJS': 1, '^PRP': 2, '^JJ': 1, '^NNP': 1}
numbersOfPos_list = list(numbersOfPos.items())
print(numbersOfPos_list)
print('\n')
numbersOfPos_list.sort(key=lambda item: item[1], reverse=True)
print(numbersOfPos_list)
[('RB', 123), ('PRP', 260), ('VBD', 49), ('IN', 105), ('DT', 130), ('JJ', 107), ('NN', 279), ('UH', 255), ('VB', 274), (':', 70), ('NNP', 248), ('VBZ', 94), ('.', 206), ('SYM', 49), ('CD', 27), ('CC', 39), ('NNS', 74), ('WDT', 4), ('VBP', 113), ('WP', 24), ('RP', 20), ('TO', 41), ('MD', 28), ('PRP$', 50), ('^NNS', 2), (',', 30), ('VBG', 26), ("''", 16), ('^NN', 3), ('POS', 10), ('EX', 4), ('BES', 10), ('WRB', 17), ('^VB', 5), ('^RB', 1), ('VBN', 8), ('PDT', 1), ('^VBZ', 2), ('RBR', 3), ('RBS', 1), ('^VBP', 2), ('(', 4), ('NNPS', 1), (')', 5), ('^VBG', 1), ('JJR', 3), ('JJS', 1), ('^PRP', 2), ('^JJ', 1), ('^NNP', 1)]
[('NN', 279), ('VB', 274), ('PRP', 260), ('UH', 255), ('NNP', 248), ('.', 206), ('DT', 130), ('RB', 123), ('VBP', 113), ('JJ', 107), ('IN', 105), ('VBZ', 94), ('NNS', 74), (':', 70), ('PRP$', 50), ('VBD', 49), ('SYM', 49), ('TO', 41), ('CC', 39), (',', 30), ('MD', 28), ('CD', 27), ('VBG', 26), ('WP', 24), ('RP', 20), ('WRB', 17), ("''", 16), ('POS', 10), ('BES', 10), ('VBN', 8), ('^VB', 5), (')', 5), ('WDT', 4), ('EX', 4), ('(', 4), ('^NN', 3), ('RBR', 3), ('JJR', 3), ('^NNS', 2), ('^VBZ', 2), ('^VBP', 2), ('^PRP', 2), ('^RB', 1), ('PDT', 1), ('RBS', 1), ('NNPS', 1), ('^VBG', 1), ('JJS', 1), ('^JJ', 1), ('^NNP', 1)]
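The same frequency table and sorted listing can also be produced with collections.Counter; a minimal sketch, assuming the tagged_tokens from above:
from collections import Counter

tag_counts = Counter(tag for token, tag in tagged_tokens)  # one-pass frequency count
print(tag_counts.most_common(10))  # (tag, count) pairs, highest count first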
pandas DataFrame, Series
import pandas as pd

df = pd.DataFrame((token, tag) for token, tag in tagged_tokens)
df.columns = ['토큰', '태그']
print(df[:10])
토큰 태그
0 now RB
1 im PRP
2 left VBD
3 with IN
4 this DT
5 gay JJ
6 name NN
7 :P UH
8 PART VB
9 hey UH
# In plain Python: multiplying each list element by 2
nums = list(range(10))
nums*2
[n*2 for n in nums]
# With a pandas Series, operators apply element-wise, the index is visible,
# and helpers such as value_counts and drop_duplicates are available
# Use a Series for 1-D data and a DataFrame for 2-D (or higher) data
num_series = pd.Series(nums)
print(num_series[:3])
print((num_series*2)[:3])
0 0
1 1
2 2
dtype: int64
0 0
1 2
2 4
dtype: int64
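To show the value_counts and drop_duplicates helpers mentioned above, a small sketch on a throwaway Series (the data here is invented purely for illustration):
fruit_series = pd.Series(['apple', 'banana', 'apple', 'cherry', 'banana', 'apple'])
print(fruit_series.value_counts())     # frequency of each distinct value, sorted descending
print(fruit_series.drop_duplicates())  # keep only the first occurrence of each value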
name_series = pd.Series([1,2,3,4,5])
print(name_series)
0 1
1 2
2 3
3 4
4 5
dtype: int64
profile = [{'name':'abc', 'sex':'male'},
{'name':'def', 'sex':'female'},
{'name':'ghi', 'sex':'male'}]
profile
df_profile = pd.DataFrame(profile)
print(df_profile)
# Selecting a single column of a DataFrame returns a Series
print(type(df_profile['sex']))
print('\n')
print(df_profile['sex'].value_counts())
name sex
0 abc male
1 def female
2 ghi male
<class 'pandas.core.series.Series'>
male 2
female 1
Name: sex, dtype: int64
# Select a column of the DataFrame; value_counts both counts and sorts the frequencies
print(df['태그'].value_counts()[:10])
NN 279
VB 274
PRP 260
UH 255
NNP 248
. 206
DT 130
RB 123
VBP 113
JJ 107
Name: 태그, dtype: int64
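The noun selection done earlier with an explicit loop can also be expressed directly on this DataFrame; a minimal sketch, assuming the df built above with the '토큰' and '태그' columns:
noun_mask = df['태그'].str.startswith('N')                              # tags beginning with N
nouns_from_df = df.loc[noun_mask, '토큰'].str.lower().drop_duplicates()  # distinct lowercased nouns
print(nouns_from_df.head(20).tolist())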