본문 바로가기
  • On the ball
Data/Python

말뭉치Corpus 다루기, Pandas 기초

by pub-lican-ai 2018. 12. 11.
반응형

import nltk

from nltk.corpus import gutenberg

#Plaintext

gutenberg

print(gutenberg.fileids()[:10])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt']

# Take the first Gutenberg file id and load that file's contents as raw text.
fid_1 = gutenberg.fileids()[0]
raw_text = gutenberg.raw(fid_1)
print(type(raw_text))  # type of the raw text object
print(raw_text[:100])  # first 100 characters
print(len(raw_text))  # total number of characters
<class 'str'>
[Emma by Jane Austen 1816]
VOLUME I
CHAPTER I
Emma Woodhouse, handsome, clever, and rich, with a
887071


# Tokenized view of the same file.
tokens = gutenberg.words(fid_1)

print(tokens[:10])

print(len(tokens))

# Ratio of raw character count to token count.
print(len(raw_text)/len(tokens))

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER']
192427
4.609909212324673

from nltk.corpus import nps_chat #XML 형태의 Corpus Reader NPSChatCorpus nps_chat

# List the file ids available in the NPS Chat corpus.
nps_chat.fileids()
# Pick the first file id and inspect it.
fid = nps_chat.fileids()[0]
fid
# Load the file's raw contents and show the first 412 characters.
raw_text = nps_chat.raw(fid)
print(raw_text[:412])

<!-- edited with XMLSpy v2007 sp1 (http://www.altova.com) by Eric Forsyth (Naval Postgraduate School) -->
<Session xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="postClassPOSTagset.xsd">
	<Posts>
		<Post class="Statement" user="10-19-20sUser7">now im left with this gay name<terminals>
				<t pos="RB" word="now"/>
				<t pos="PRP" word="im"/>
				<t pos="VBD" word="left"/>

# The reader parses the XML and returns only the word values.
tokens = nps_chat.words(fid)
print(tokens[:10])
# tagged_words also includes the POS information, giving (word, tag) tuples.
tagged_tokens = nps_chat.tagged_words(fid)
print(tagged_tokens[:10])
['now', 'im', 'left', 'with', 'this', 'gay', 'name', ':P', 'PART', 'hey']
[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ('with', 'IN'), ('this', 'DT'), ('gay', 'JJ'), ('name', 'NN'), (':P', 'UH'), ('PART', 'VB'), ('hey', 'UH')]

말뭉치 분석

  1. 품사 태그가 N으로 시작하는 토큰(명사)만 선택
  2. 각 품사별 도수 분석, 도수 높은 품사순으로 정리


# Each (token, tag) pair is unpacked in the comprehension; keep the tokens
# whose POS tag starts with 'N'.
nouns = [word for word, pos in tagged_tokens if pos.startswith('N')]

print(nouns[:20])

['name', 'everyone', 'NICK', 'U7', 'U7', 'name', 'ACTION', 'U121', 'golf', 'clap', 'U59', 'm', 'ky', 'women', 'U7', 'golf', 'U121', 'everyone', 'thunder', 'ass']

# Distinct tokens whose POS tag starts with 'N', in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order and avoids the
# quadratic cost of testing membership against a growing list.
nouns_distinct = list(dict.fromkeys(
    token for token, tag in tagged_tokens if tag.startswith('N')
))
print(nouns_distinct[:20])
['name', 'everyone', 'NICK', 'U7', 'ACTION', 'U121', 'golf', 'clap', 'U59', 'm', 'ky', 'women', 'thunder', 'ass', 'cousin', 'pic', 'cast', 'U115', 'girl', 'legs']

def selectToken(tagFilter, tagged=None):
    """Return the distinct lower-cased tokens whose tag starts with ``tagFilter``.

    Args:
        tagFilter: Prefix (or tuple of prefixes) handed to ``str.startswith``
            for each tag, e.g. ``'N'``.
        tagged: Iterable of ``(token, tag)`` pairs. When omitted, the
            module-level ``tagged_tokens`` is used.

    Returns:
        A list of lower-cased tokens without duplicates, in order of first
        appearance.
    """
    if tagged is None:
        tagged = tagged_tokens
    seen = set()  # O(1) membership test instead of scanning the result list
    selected = []
    for token, tag in tagged:
        if not tag.startswith(tagFilter):
            continue
        token = token.lower()
        if token not in seen:
            seen.add(token)
            selected.append(token)
    return selected

nouns = selectToken('N')

print(nouns[:20])

['name', 'everyone', 'nick', 'u7', 'action', 'u121', 'golf', 'clap', 'u59', 'm', 'ky', 'women', 'thunder', 'ass', 'cousin', 'pic', 'cast', 'u115', 'girl', 'legs']

# Frequency of each POS tag, keyed by tag in order of first appearance.
numbersOfPos = {}
for _, pos in tagged_tokens:
    numbersOfPos[pos] = numbersOfPos.get(pos, 0) + 1
{'RB': 123, 'PRP': 260, 'VBD': 49, 'IN': 105, 'DT': 130, 'JJ': 107, 'NN': 279, 'UH': 255, 'VB': 274, ':': 70, 'NNP': 248, 'VBZ': 94, '.': 206, 'SYM': 49, 'CD': 27, 'CC': 39, 'NNS': 74, 'WDT': 4, 'VBP': 113, 'WP': 24, 'RP': 20, 'TO': 41, 'MD': 28, 'PRP$': 50, '^NNS': 2, ',': 30, 'VBG': 26, "''": 16, '^NN': 3, 'POS': 10, 'EX': 4, 'BES': 10, 'WRB': 17, '^VB': 5, '^RB': 1, 'VBN': 8, 'PDT': 1, '^VBZ': 2, 'RBR': 3, 'RBS': 1, '^VBP': 2, '(': 4, 'NNPS': 1, ')': 5, '^VBG': 1, 'JJR': 3, 'JJS': 1, '^PRP': 2, '^JJ': 1, '^NNP': 1}

# Turn the tag -> count mapping into a list of (tag, count) pairs.
numbersOfPos_list = list(numbersOfPos.items())
print(numbersOfPos_list)
print('\n')
# Sort in place by count, most frequent tag first.
numbersOfPos_list.sort(key=lambda pair: pair[1], reverse=True)

print(numbersOfPos_list)

[('RB', 123), ('PRP', 260), ('VBD', 49), ('IN', 105), ('DT', 130), ('JJ', 107), ('NN', 279), ('UH', 255), ('VB', 274), (':', 70), ('NNP', 248), ('VBZ', 94), ('.', 206), ('SYM', 49), ('CD', 27), ('CC', 39), ('NNS', 74), ('WDT', 4), ('VBP', 113), ('WP', 24), ('RP', 20), ('TO', 41), ('MD', 28), ('PRP$', 50), ('^NNS', 2), (',', 30), ('VBG', 26), ("''", 16), ('^NN', 3), ('POS', 10), ('EX', 4), ('BES', 10), ('WRB', 17), ('^VB', 5), ('^RB', 1), ('VBN', 8), ('PDT', 1), ('^VBZ', 2), ('RBR', 3), ('RBS', 1), ('^VBP', 2), ('(', 4), ('NNPS', 1), (')', 5), ('^VBG', 1), ('JJR', 3), ('JJS', 1), ('^PRP', 2), ('^JJ', 1), ('^NNP', 1)]


[('NN', 279), ('VB', 274), ('PRP', 260), ('UH', 255), ('NNP', 248), ('.', 206), ('DT', 130), ('RB', 123), ('VBP', 113), ('JJ', 107), ('IN', 105), ('VBZ', 94), ('NNS', 74), (':', 70), ('PRP$', 50), ('VBD', 49), ('SYM', 49), ('TO', 41), ('CC', 39), (',', 30), ('MD', 28), ('CD', 27), ('VBG', 26), ('WP', 24), ('RP', 20), ('WRB', 17), ("''", 16), ('POS', 10), ('BES', 10), ('VBN', 8), ('^VB', 5), (')', 5), ('WDT', 4), ('EX', 4), ('(', 4), ('^NN', 3), ('RBR', 3), ('JJR', 3), ('^NNS', 2), ('^VBZ', 2), ('^VBP', 2), ('^PRP', 2), ('^RB', 1), ('PDT', 1), ('RBS', 1), ('NNPS', 1), ('^VBG', 1), ('JJS', 1), ('^JJ', 1), ('^NNP', 1)]

pandas DataFrame, Series


import pandas as pd

# Build a two-column DataFrame from the (token, tag) pairs; the Korean column
# labels mean "token" and "tag".
df = pd.DataFrame(list(tagged_tokens), columns=['토큰', '태그'])
print(df[:10])
    토큰   태그
0   now   RB
1    im  PRP
2  left  VBD
3  with   IN
4  this   DT
5   gay   JJ
6  name   NN
7    :P   UH
8  PART   VB
9   hey   UH

# Doubling every element of a list in plain Python.
nums = [*range(10)]
nums * 2  # list repetition: yields the list twice over, not doubled elements
[2 * value for value in nums]  # element-wise doubling needs a comprehension

# A pandas Series applies operators element by element and shows its index;
# helpers such as value_counts and drop_duplicates are also available.
# Use a Series for 1-D data and a DataFrame for data with two or more dimensions.
num_series = pd.Series(nums)
print(num_series.head(3))
print(num_series.mul(2).head(3))
0    0
1    1
2    2
dtype: int64
0    0
1    2
2    4
dtype: int64

# A Series holding the integers 1 through 5 with the default 0..4 index.
name_series = pd.Series(list(range(1, 6)))
print(name_series)
0    1
1    2
2    3
3    4
4    5
dtype: int64

# One record (dict) per person; each dict key becomes a DataFrame column.
profile = [
    {'name': name, 'sex': sex}
    for name, sex in (('abc', 'male'), ('def', 'female'), ('ghi', 'male'))
]
profile
df_profile = pd.DataFrame(profile)
print(df_profile)
# Selecting a single DataFrame column returns a Series.
print(type(df_profile['sex']))
print('\n')
# Tally how often each value occurs in the column.
print(df_profile['sex'].value_counts())
  name     sex
0  abc    male
1  def  female
2  ghi    male
<class 'pandas.core.series.Series'>


male      2
female    1
Name: sex, dtype: int64

# Selecting a DataFrame column and calling value_counts tallies the
# frequencies and returns them already sorted; show the ten most common tags.
print(df['태그'].value_counts().head(10))
NN     279
VB     274
PRP    260
UH     255
NNP    248
.      206
DT     130
RB     123
VBP    113
JJ     107
Name: 태그, dtype: int64


반응형