R로 하는 감정 분석

R로 하는 감정 분석에 대해 알아보겠습니다. 감정 분석할 텍스트를 불러온 뒤 긍정과 부정을 분석하는 방법을 R로 구현해 보겠습니다.

https://github.com/park1200656/KnuSentiLex

1. 감정 사전 활용하기

감정 사전은 아래 데이터를 다운로드해 사용합니다.

# Load the sentiment lexicon. The code below relies on its `word` and
# `polarity` columns.
library(readr)
library(dplyr)

dic <- read_csv("knu_sentiment_lexicon.csv")

# Positive words: entries with polarity 2, sorted by word.
arrange(filter(dic, polarity == 2), word)

# Negative words: entries with polarity -2, sorted by word.
arrange(filter(dic, polarity == -2), word)

감정 단어의 종류 살펴보기

# Look up a few sentiment words in the lexicon, one pair at a time.
filter(dic, word %in% c("좋은", "나쁜"))

filter(dic, word %in% c("기쁜", "슬픈"))

filter(dic, word %in% c("행복하다", "좌절하다"))

# Emoticons: entries whose `word` contains no Hangul syllable.
library(stringr)
dic %>%
  arrange(word) %>%
  filter(!str_detect(word, "[가-힣]"))

# Number of lexicon entries per class:
# polarity >= 1 is "pos", polarity <= -1 is "neg", everything else is "neu".
dic %>%
  mutate(
    sentiment = ifelse(
      polarity >= 1, "pos",
      ifelse(polarity <= -1, "neg", "neu")
    )
  ) %>%
  count(sentiment)

문장의 감정 점수 구하기

1. 단어 기준으로 토큰화하기

unnest_tokens()으로 토큰화 진행, drop = F로 원문을 제거하지 않도록 함.

# Two example reviews, one per row, to be scored with the lexicon.
df <- tibble(sentence = c("디자인 예쁘고 마감도 좋아서 만족스럽다.",
                          "디자인은 괜찮다. 그런데 마감이 나쁘고 가격도 비싸다."))
df

# Split each sentence into word tokens, one row per token.
# `drop = FALSE` keeps the original `sentence` column next to every token.
# It is spelled out in full because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
library(tidytext)
df <- df %>%
  unnest_tokens(input = sentence,
                output = word,
                token = "words",
                drop = FALSE)

df

2. 단어에 감정 점수 부여하기

# Attach each token's lexicon polarity. Tokens that are not in the lexicon
# come back from the join as NA and are scored 0.
df <- left_join(df, dic, by = "word")
df <- mutate(df, polarity = coalesce(polarity, 0))

df

3. 문장별로 감정 점수 합산하기

# Sentence score: the sum of the polarities of the sentence's tokens.
score_df <- summarise(group_by(df, sentence), score = sum(polarity))

score_df

2. 댓글 감정 분석하기

news_comment_parasite 다운로드

1) 기본적인 전처리

고유 번호 변수 만들기 : 내용이 같아도 구별할 수 있도록 mutate(), row_number()를 이용해 고유 번호 id를 추가함.
html 특수 문자 제거하기
- &nbsp 제거, textclean → replace_html() 이용해 html 태그를 공백으로 변경.
- stringr → str_squish()를 이용해 중복 공백 제거
감정분석을 위해 특수 문자와 두 글자 미만 단어 포함하기
glimpse() 데이터 구조를 요약해 보여 주는 dplyr 패키지의 함수 → 요약 결과를 줄을 맞춰 출력하기 때문에 str()보다 데이터 구조를 파악하기 좋음.

# Load the news comments.
raw_news_comment <- read_csv("news_comment_parasite.csv")

# Basic preprocessing.
# Install textclean only when it is missing, so that re-running the script
# does not download and reinstall the package every time.
if (!requireNamespace("textclean", quietly = TRUE)) {
  install.packages("textclean")
}
library(textclean)

# id    : row number, so comments with identical text can be told apart.
# reply : HTML markup replaced via replace_html(), then repeated whitespace
#         collapsed via str_squish().
news_comment <- raw_news_comment %>%
  mutate(id = row_number(),
         reply = str_squish(replace_html(reply)))

# Inspect the structure of the cleaned data.
glimpse(news_comment)

2) 단어 기준으로 토큰화하고 감정 점수 부여하기

# Tokenize every comment into words, one row per token.
# `drop = FALSE` keeps the original `reply` text on each row; it is written
# out in full because `F` is a reassignable variable, not a constant.
word_comment <- news_comment %>%
  unnest_tokens(input = reply,
                output = word,
                token = "words",
                drop = FALSE)

word_comment %>%
  select(word, reply)

# Assign a polarity to each token; tokens absent from the lexicon get 0.
word_comment <- word_comment %>%
  left_join(dic, by = "word") %>%
  mutate(polarity = ifelse(is.na(polarity), 0, polarity))

word_comment %>%
  select(word, polarity)

3) 자주 사용된 감정 단어 살펴보기

1. 감정 분류하기

# Label each token by its polarity:
# exactly 2 is "pos", exactly -2 is "neg", every other value is "neu".
word_comment <- mutate(
  word_comment,
  sentiment = ifelse(polarity == -2, "neg",
                     ifelse(polarity == 2, "pos", "neu"))
)

# Number of tokens per label.
count(word_comment, sentiment)

2. 막대 그래프 만들기

# The most frequent words within the "pos" and the "neg" label.
# slice_max() is called without `with_ties`, so tied counts are all kept and
# a group may contain more than ten rows.
top10_sentiment <- word_comment %>%
  filter(sentiment != "neu") %>%
  count(sentiment, word) %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10)

top10_sentiment

# Horizontal bar chart, one facet per label, with the count printed next to
# each bar.
# NOTE(review): assumes a font family named "nanumgothic" is registered in
# this session — confirm.
library(ggplot2)
ggplot(
  top10_sentiment,
  aes(x = reorder(word, n), y = n, fill = sentiment)
) +
  geom_col() +
  geom_text(aes(label = n), hjust = -0.3) +
  coord_flip() +
  facet_wrap(~ sentiment, scales = "free") +
  # Extra room at the end of the count axis so the labels are not clipped.
  scale_y_continuous(expand = expansion(mult = c(0.05, 0.15))) +
  theme(text = element_text(family = "nanumgothic")) +
  labs(x = NULL)

4) 댓글별 감정 점수 구하고 내용 살펴보기

1. 댓글별 감정 점수 구하기

# Comment score: the sum of token polarities per comment. Grouping by both
# `id` and `reply` keeps comments with identical text separate.
score_comment <- ungroup(
  summarise(group_by(word_comment, id, reply), score = sum(polarity))
)

select(score_comment, score, reply)

2. 감정 점수 높은 댓글 살펴보기

# Most positive comments: highest score first.
score_comment %>%
  arrange(desc(score)) %>%
  select(score, reply)

# Most negative comments: lowest score first.
score_comment %>%
  arrange(score) %>%
  select(score, reply)

5) 감정 경향 살펴보기

1. 감정 점수 빈도 구하기

# Number of comments at each score value.
count(score_comment, score)

2. 감정 분류하고 막대 그래프 만들기

# Classify each comment by its total score:
# score >= 1 is "pos", score <= -1 is "neg", everything else is "neu".
score_comment <- mutate(
  score_comment,
  sentiment = ifelse(score <= -1, "neg",
                     ifelse(score >= 1, "pos", "neu"))
)

scale_x_discrete()는 x축 순서를 정하는 기능으로 따로 정하지 않으면 항목의 알파벳 순서로 정렬된다.
샘플 데이터로 비율 누적 막대 그래프 만들기

# Sample data for a stacked percentage bar chart.
# The column name and labels are spelled correctly ("country", "Japan")
# because they are shown as the axis title and tick labels of the plots.
df <- tibble(country = c("Korea", "Korea", "Japan", "Japan"),  # x axis
             sex = c("M", "F", "M", "F"),                      # stacked segments
             ratio = c(60, 40, 30, 70))                        # bar values
df

# Plain stacked bars.
ggplot(df, aes(x = country, y = ratio, fill = sex)) + geom_col()

# Stacked bars with each segment labelled with its percentage.
ggplot(df, aes(x = country, y = ratio, fill = sex)) +
  geom_col() +
  geom_text(aes(label = paste0(ratio, "%")),          # append a % sign
            position = position_stack(vjust = 0.5))   # centre in the segment

댓글의 감정 비율로 누적 막대 그래프 만들기

# Frequency and percentage of comments in each sentiment class.
# `frequency_score` has to be created here: the code below reads its
# `sentiment` and `ratio` columns, and nothing earlier defines it.
frequency_score <- score_comment %>%
  count(sentiment) %>%
  mutate(ratio = n / sum(n) * 100)

# Constant dummy variable so that all classes stack into a single bar.
frequency_score$dummy <- 0
frequency_score

# One stacked bar with each segment labelled with its rounded percentage.
ggplot(frequency_score, aes(x = dummy, y = ratio, fill = sentiment)) +
  geom_col() +
  geom_text(aes(label = paste0(round(ratio, 1), "%")),
              position = position_stack(vjust = 0.5)) +
  theme(axis.title.x = element_blank(),  # remove the x-axis title
        axis.text.x  = element_blank(),  # remove the x-axis labels
        axis.ticks.x = element_blank())  # remove the x-axis ticks

3. 감정 범주별 주요 단어 살펴보기

1) 감정 범주별 단어 빈도 구하기

1. 토큰화하고 두 글자 이상 한글 단어만 남기기

# Re-tokenize the scored comments into words, keeping `reply` on each row.
# `drop = FALSE` is written out in full because `F` is a reassignable
# variable, not a constant.
# Only tokens that contain a Hangul syllable and are at least two characters
# long are kept.
comment <- score_comment %>%
  unnest_tokens(input = reply,
                output = word,
                token = "words",
                drop = FALSE) %>%
  filter(str_detect(word, "[가-힣]") &
         str_count(word) >= 2)

2. 감정 범주별 단어 빈도 구하기

# Word frequency within each comment-level sentiment class, most frequent
# first (`sort = TRUE`, spelled out because `T` can be reassigned).
# `comment` was already restricted to tokens of two or more characters when
# it was built, so that filter is not repeated here.
frequency_word <- comment %>%
  count(sentiment, word, sort = TRUE)

# -------------------------------------------------------------------------
# High-frequency words in positive comments.
frequency_word %>%
  filter(sentiment == "pos")

# High-frequency words in negative comments.
frequency_word %>%
  filter(sentiment == "neg")

2) 상대적으로 자주 사용된 단어 비교하기

1. 로그 오즈비 구하기

library(tidyr)

# One row per word, with its count in "pos" and in "neg" comments as separate
# columns. Neutral comments are excluded, and a word missing from one class
# gets a count of 0.
comment_wide <- pivot_wider(
  filter(frequency_word, sentiment != "neu"),
  names_from = sentiment,
  values_from = n,
  values_fill = list(n = 0)
)

comment_wide

# Log odds ratio of each word between positive and negative comments.
# 1 is added to every count, so a zero count never produces log(0) or a
# division by zero.
comment_wide <- mutate(
  comment_wide,
  log_odds_ratio = log(((pos + 1) / (sum(pos + 1))) /
                         ((neg + 1) / (sum(neg + 1))))
)

comment_wide

2. 로그 오즈비가 가장 큰 단어 10개씩 추출하기

# Words with the largest absolute log odds ratio in each direction
# (ratio > 0 is "pos", otherwise "neg").
# Without `with_ties`, slice_max() keeps every tied row, so a group can
# contain more than ten words.
top10 <- comment_wide %>%
  group_by(sentiment = ifelse(log_odds_ratio > 0, "pos", "neg")) %>%
  slice_max(abs(log_odds_ratio), n = 10)

# Print every row to make the ties visible.
top10 %>% print(n = Inf)

# Same selection with ties dropped, so each group has at most ten words.
# `FALSE` is spelled out because `F` is a reassignable variable.
top10 <- comment_wide %>%
  group_by(sentiment = ifelse(log_odds_ratio > 0, "pos", "neg")) %>%
  slice_max(abs(log_odds_ratio), n = 10, with_ties = FALSE)

top10

3. 막대 그래프 만들기

# Horizontal bar chart of the selected words, ordered by log odds ratio and
# coloured by direction.
# NOTE(review): assumes a font family named "nanumgothic" is registered in
# this session — confirm.
ggplot(
  top10,
  aes(x = reorder(word, log_odds_ratio), y = log_odds_ratio, fill = sentiment)
) +
  geom_col() +
  coord_flip() +
  theme(text = element_text(family = "nanumgothic")) +
  labs(x = NULL)

4. 감정 사전 수정하기

1) 감정 단어가 사용된 원문 살펴보기

# "소름"이 사용된 댓글
score_comment %>%
  select(reply) %>%
  filter(str_detect(reply, "소름"))

# Comments whose text contains "미친".
score_comment %>%
  select(reply) %>%
  filter(str_detect(reply, "미친"))

# How the lexicon currently scores these words.
filter(dic, word %in% c("소름", "소름이", "미친"))

2) 감정 사전 수정하기

# Copy of the lexicon in which the three listed words are given polarity 2;
# every other entry keeps its original polarity.
new_dic <- mutate(
  dic,
  polarity = ifelse(word %in% c("소름", "소름이", "미친"), 2, polarity)
)

# Confirm the change.
filter(new_dic, word %in% c("소름", "소름이", "미친"))

3) 수정한 사전으로 감정 점수 부여하기

# Re-score the tokens with the modified lexicon: drop the old polarity, join
# the new one, and score tokens that are not in the lexicon as 0.
new_word_comment <- select(word_comment, -polarity)
new_word_comment <- left_join(new_word_comment, new_dic, by = "word")
new_word_comment <- mutate(new_word_comment, polarity = coalesce(polarity, 0))

4) 댓글별 감정 점수 구하기

# Comment score under the modified lexicon: the sum of token polarities per
# comment, grouped by `id` and `reply`.
new_score_comment <- ungroup(
  summarise(group_by(new_word_comment, id, reply), score = sum(polarity))
)

# Highest-scoring comments first.
new_score_comment %>%
  arrange(desc(score)) %>%
  select(score, reply)

5) 감정 경향 살펴보기

1. 감정 분류하기

# Classify comments with a threshold of 1:
# score >= 1 is "pos", score <= -1 is "neg", everything else is "neu".
new_score_comment <- mutate(
  new_score_comment,
  sentiment = ifelse(score <= -1, "neg",
                     ifelse(score >= 1, "pos", "neu"))
)

2. 감정 범주별 빈도와 비율 구하기

# Class counts and percentages with the original lexicon.
mutate(count(score_comment, sentiment), ratio = n/sum(n)*100)

# Class counts and percentages with the modified lexicon.
mutate(count(new_score_comment, sentiment), ratio = n/sum(n)*100)

# Regular expression matching comments that contain any of the modified words.
word <- "소름|소름이|미친"

# Class counts among matching comments, original lexicon.
count(filter(score_comment, str_detect(reply, word)), sentiment)

# Class counts among matching comments, modified lexicon.
count(filter(new_score_comment, str_detect(reply, word)), sentiment)

# -------------------------------------------------------------------------
# Example sentences containing neologisms, tokenized into words.
# `drop = FALSE` keeps the `sentence` column; it is spelled out because `F`
# is a reassignable variable, not a constant.
df <- tibble(sentence = c("이번 에피소드 쩐다",
                          "이 영화 핵노잼")) %>%
  unnest_tokens(input = sentence,
                output = word,
                token = "words",
                drop = FALSE)

# Sentence scores with the original lexicon.
df %>%
  left_join(dic, by = "word") %>%
  mutate(polarity = ifelse(is.na(polarity), 0, polarity)) %>%
  group_by(sentence) %>%
  summarise(score = sum(polarity))

# Neologisms with the polarity to assign to each.
newword <- tibble(word = c("쩐다", "핵노잼"),
                  polarity = c(2, -2))

# Append the neologisms to the lexicon.
newword_dic <- bind_rows(dic, newword)

# Sentence scores with the extended lexicon.
df %>%
  left_join(newword_dic, by = "word") %>%
  mutate(polarity = ifelse(is.na(polarity), 0, polarity)) %>%
  group_by(sentence) %>%
  summarise(score = sum(polarity))

6) 감정 범주별 주요 단어 살펴보기

1. 두 글자 이상 한글 단어만 남기고 단어 빈도 구하기

# Tokenize the re-scored comments into words, keeping `reply` on each row,
# then keep only tokens that contain a Hangul syllable and are at least two
# characters long.
# `FALSE` / `TRUE` are spelled out because `F` / `T` are reassignable
# variables, not constants.
new_comment <- new_score_comment %>%
  unnest_tokens(input = reply,
                output = word,
                token = "words",
                drop = FALSE) %>%
  filter(str_detect(word, "[가-힣]") &
           str_count(word) >= 2)

# Word frequency within each sentiment class, most frequent first.
new_frequency_word <- new_comment %>%
  count(sentiment, word, sort = TRUE)

2. 로그 오즈비 구하기

# Wide form: one row per word with its "pos" and "neg" counts as columns.
# Neutral comments are excluded and missing counts are filled with 0.
new_comment_wide <- pivot_wider(
  filter(new_frequency_word, sentiment != "neu"),
  names_from = sentiment,
  values_from = n,
  values_fill = list(n = 0)
)

# Log odds ratio per word, with 1 added to every count so that a zero count
# never produces log(0) or a division by zero.
new_comment_wide <- mutate(
  new_comment_wide,
  log_odds_ratio = log(((pos + 1) / (sum(pos + 1))) /
                         ((neg + 1) / (sum(neg + 1))))
)

3. 로그 오즈비가 큰 단어로 막대 그래프 만들기

# Up to ten words with the largest absolute log odds ratio in each direction
# (ratio > 0 is "pos", otherwise "neg"). Ties are dropped so a group never
# exceeds ten rows. `FALSE` is spelled out because `F` is a reassignable
# variable.
new_top10 <- new_comment_wide %>%
  group_by(sentiment = ifelse(log_odds_ratio > 0, "pos", "neg")) %>%
  slice_max(abs(log_odds_ratio), n = 10, with_ties = FALSE)

# Horizontal bar chart of the selected words, ordered by log odds ratio.
# NOTE(review): assumes a font family named "nanumgothic" is registered in
# this session — confirm.
ggplot(new_top10, aes(x = reorder(word, log_odds_ratio),
                      y = log_odds_ratio,
                      fill = sentiment)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL) +
  theme(text = element_text(family = "nanumgothic"))

4. 주요 단어가 사용된 댓글 살펴보기

# Original text of positive comments that contain a given key word.
new_score_comment %>%
  filter(sentiment == "pos", str_detect(reply, "축하")) %>%
  select(reply)

new_score_comment %>%
  filter(sentiment == "pos", str_detect(reply, "소름")) %>%
  select(reply)

# -------------------------------------------------------------------------
# Original text of negative comments that contain a given key word.
new_score_comment %>%
  filter(sentiment == "neg", str_detect(reply, "좌빨")) %>%
  select(reply)

new_score_comment %>%
  filter(sentiment == "neg", str_detect(reply, "못한")) %>%
  select(reply)

형태소 분석기에 대해 궁금하신 분은 여기 링크를 참고바랍니다.

R 감정분석 감정사전