[Python] - 워드 클라우드 사용하기

Python

[Python] - 워드 클라우드 사용하기

nam_ji 2024. 9. 25. 19:05

워드 클라우드 설명 및 사용 예제

워드 클라우드란?

워드 클라우드는 텍스트 분석에서 기본적으로 사용되는 시각화 방식입니다. 전체 응답(텍스트)을 키워드 단위로 분리한 뒤, 많이 등장한 키워드일수록 더 크게, 그리고 색상으로 구분하여 시각화합니다.

워드 클라우드 사용 전

필요한 패키지
- import matplotlib.pyplot as plt
- from wordcloud import WordCloud
- from PIL import Image
- import numpy as np

사용해보기

저는 전에 해봤던 가장 많이 나온 단어 찾기 예제를 이용해서 워드 클라우드를 사용해 보겠습니다.

기본 세팅
워드 클라우드 모듈을 이용하여 폰트 디자인, 배경 색상, 이미지 모양, 가져온 문장들을 설정해줍니다.

# Build the word cloud: font_path selects the font file used to draw the words,
# background_color sets the canvas colour, and mask is the image array that
# gives the cloud its shape. generate_from_frequencies() receives a
# {word: count} dict built from the (word, count) pairs in self.text_top_count.
wc = WordCloud(
	font_path="C:/WorkSpace/Python/LaundryGothic_OTF/런드리고딕OTF Bold.otf",
	background_color="white",
	mask=imgArray
).generate_from_frequencies(dict(self.text_top_count))

plt 모듈을 이용하여 분석된 단어들이 어떤 사이즈에서 보여지게 되는지, 눈금자는 표시할지 등등의 설정을 해줍니다.
```python
# Create a 10x10 figure to draw on.
plt.figure(figsize = (10,10))
# Draw the generated word cloud as an image, smoothed with bilinear interpolation.
plt.imshow(wc, interpolation='bilinear')
# Hide the axes (ticks / rulers) around the image.
plt.axis("off")
# Open the window and display the figure.
plt.show()
```
이렇게 설정하면 워드 클라우드를 사용할 수 있습니다.

전체 코드

가장 많이 나온 단어

import re
from collections import Counter

from konlpy.tag import Okt

# Banner printed at module level, so it also runs whenever this module is imported.
# NOTE(review): the message announces 20 words, but TextCounter.top_count()
# uses most_common(50) — confirm which number is intended.
print('한국 소설 현진건의 운수 좋은 날에서 가장 많이 나온 단어 20개 추출')
print('불용어 제거하는 전처리 작업도 수행')

class TextCounter:
    """Count the most frequent morphemes in a Korean text file.

    The pipeline (see ``get_final_result``) reads the file, strips
    punctuation, splits the text into morphemes with Okt, drops stop words
    and counts what is left.

    Every stage rebuilds its output list from scratch, so the pipeline can be
    run more than once on the same instance without accumulating duplicates.
    """

    # Stop-word file used when the caller does not supply one.
    DEFAULT_STOP_WORDS_PATH = "C:/WorkSpace/Python/python-basic/blog/한국어_불용어.txt"
    # Number of (word, count) pairs kept when the caller does not supply one.
    DEFAULT_TOP_N = 50

    def __init__(self, file_path, stop_words_path=DEFAULT_STOP_WORDS_PATH):
        """Prepare the analyser.

        Args:
            file_path: Path of the UTF-8 text file to analyse.
            stop_words_path: Path of a UTF-8 file holding one stop word per
                line. Defaults to ``DEFAULT_STOP_WORDS_PATH``.
        """
        self.okt = Okt()
        self.file_path = file_path
        self.stop_words_path = stop_words_path
        self.sentences_tag = []  # morphemes produced by Okt
        self.stop_word_list = []  # stop words loaded from stop_words_path
        self.after_stop_word = []  # morphemes left after stop-word removal
        self.result = []  # most common (word, count) pairs

    def morpheme_separation_space(self):
        """Read the text file and split it into morphemes with Okt."""
        # Context manager so the file handle is closed deterministically.
        with open(self.file_path, 'r', encoding='UTF-8') as source:
            text = source.read()
        # Drop everything that is neither a word character nor whitespace.
        text = re.sub(r'[^\w\s]', '', text)

        # Keep only tokens that are not blank.
        self.sentences_tag = [
            word for word in self.okt.morphs(text) if word.strip() != ''
        ]

    def stop_words_space(self):
        """Load the stop-word list, one stripped word per non-blank line."""
        with open(self.stop_words_path, 'r', encoding='UTF-8') as stop_file:
            stripped = [line.strip() for line in stop_file]
        # Blank lines can never match a token, so they are left out.
        self.stop_word_list = [word for word in stripped if word]

    def remove_stop_words(self):
        """Filter the stop words out of the morpheme list."""
        # Build the set once: O(1) membership per token instead of a list scan.
        stop_words = set(self.stop_word_list)
        self.after_stop_word = [
            word.strip() for word in self.sentences_tag
            if word not in stop_words
        ]

    def top_count(self, n=DEFAULT_TOP_N):
        """Store and return the ``n`` most common remaining words.

        Args:
            n: How many (word, count) pairs to keep. Defaults to 50.

        Returns:
            A list of ``(word, count)`` tuples, most frequent first.
        """
        self.result = Counter(self.after_stop_word).most_common(n)
        return self.result

    def get_final_result(self):
        """Run the whole pipeline and return the most common words."""
        self.morpheme_separation_space()
        self.stop_words_space()
        self.remove_stop_words()
        return self.top_count()

if __name__ == '__main__':
    # Run the full counting pipeline on the sample novel and print the
    # resulting list of (word, count) pairs.
    counter = TextCounter(r"C:\WorkSpace\Python\python-basic\blog\운수좋은날.txt")
    print(counter.get_final_result())

워드 클라우드

import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import pandas as pd

from blog.wordCloudMostCommonWords import TextCounter


class WordCloudExample:
    """Render a word cloud from the most frequent words of a text file."""

    # Defaults reproduce the paths that used to be hard-coded in the methods.
    DEFAULT_TEXT_PATH = r"C:\WorkSpace\Python\python-basic\blog\운수좋은날.txt"
    DEFAULT_FONT_PATH = "C:/WorkSpace/Python/LaundryGothic_OTF/런드리고딕OTF Bold.otf"
    DEFAULT_MASK_PATH = 'C:/WorkSpace/Python/python-basic/image/heart.png'

    def __init__(self, text_path=DEFAULT_TEXT_PATH,
                 font_path=DEFAULT_FONT_PATH, mask_path=DEFAULT_MASK_PATH):
        """Store the input locations.

        Args:
            text_path: Text file handed to ``TextCounter``.
            font_path: Font file used to draw the words.
            mask_path: Image whose pixel array is used as the cloud mask.
        """
        self.text_path = text_path
        self.font_path = font_path
        self.mask_path = mask_path
        self.text_top_count = []  # (word, count) pairs filled by text_count()

    def text_count(self):
        """Compute the most common words and keep them on the instance."""
        text_counter = TextCounter(self.text_path)
        self.text_top_count = text_counter.get_final_result()

    def word_cloud(self):
        """Build the word cloud from the stored frequencies and display it."""
        # Nothing counted yet: run the counting step first instead of
        # handing an empty frequency table to the generator.
        if not self.text_top_count:
            self.text_count()

        # Context manager releases the image file once the pixels are copied.
        with Image.open(self.mask_path) as mask_image:
            mask_array = np.array(mask_image)

        wc = WordCloud(
            font_path=self.font_path,
            background_color="white",
            mask=mask_array
        ).generate_from_frequencies(dict(self.text_top_count))

        plt.figure(figsize=(10, 10))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")  # hide axis ticks around the image
        plt.show()

if __name__ == '__main__':
    # Count the words first, then draw the cloud from those counts.
    example = WordCloudExample()
    example.text_count()
    example.word_cloud()

* 폰트 에러 *

`font_path`를 설정하지 않아도 에러가 발생하는 것은 아니지만, 기본 폰트에 한글 글리프가 없기 때문에 워드 클라우드가 표시될 때 글자는 안 보이고 모형(□)들만 표시되는 경우가 있습니다. 그래서 한글을 지원하는 폰트를 `font_path`로 지정해 주는 것이 좋습니다.