rm(list=ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 507710 27.2 1128478 60.3 644242 34.5
## Vcells 903102 6.9 8388608 64.0 1636209 12.5
#====================================#
# 환경 구성 라이브러리 조성 #
# Adding basic Library Environment #
#====================================#
# 1) loading and writing Text or Excel files
library(readtext)# A package for reading and handling text data in various formats like plain text, CSV, JSON, or XML.
library(readxl) # A library for reading Excel files (both .xls and .xlsx formats) into R.
library(writexl) # A package to write data to Excel files (.xlsx) without any dependencies on Java or other external libraries.
# library(pdftools) # A library for extracting text and metadata from PDF files.
# 2) Text Cleansing
library(textclean) # A package providing functions for cleaning and preprocessing text data.
# 3) LSE Mining Tool
library(quanteda)
## Package version: 3.3.1
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## 다음의 패키지를 부착합니다: 'quanteda'
## The following object is masked from 'package:readtext':
##
## texts
library(quanteda.textstats)
library(quanteda.textmodels)
library(quanteda.textplots)
library(SnowballC)
library(tidytext)
# 4) A collection of R packages, including ggplot2, dplyr, and tidyr, designed for data science workflows.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# 5) Stopwords library
library(stopwords)
# 6) plot library
library(igraph)
##
## 다음의 패키지를 부착합니다: 'igraph'
##
## The following objects are masked from 'package:lubridate':
##
## %--%, union
##
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
##
## The following objects are masked from 'package:purrr':
##
## compose, simplify
##
## The following object is masked from 'package:tidyr':
##
## crossing
##
## The following object is masked from 'package:tibble':
##
## as_data_frame
##
## The following object is masked from 'package:quanteda.textplots':
##
## as.igraph
##
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
##
## The following object is masked from 'package:base':
##
## union
library(ggraph)
# 7) Topic Model library
library(topicmodels)
library(seededlda)
## 필요한 패키지를 로딩중입니다: proxyC
##
## 다음의 패키지를 부착합니다: 'proxyC'
##
## The following object is masked from 'package:stats':
##
## dist
##
##
## 다음의 패키지를 부착합니다: 'seededlda'
##
## The following objects are masked from 'package:topicmodels':
##
## terms, topics
##
## The following object is masked from 'package:igraph':
##
## sizes
##
## The following object is masked from 'package:stats':
##
## terms
# 8) Sampling Library
library(rsample)
# 9) Extra Fonts
library(extrafont)
## Registering fonts with R
# 10) Managing dates
library(lubridate)
# 11) Sentiment Analysis
library(sentometrics)
library(scales)
##
## 다음의 패키지를 부착합니다: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(syuzhet)
##
## 다음의 패키지를 부착합니다: 'syuzhet'
##
## The following object is masked from 'package:scales':
##
## rescale
theme_set(theme_gray(base_family='NanumGothic'))
library(RmecabKo)
library(RcppMeCab)
##
## 다음의 패키지를 부착합니다: 'RcppMeCab'
##
## The following object is masked from 'package:RmecabKo':
##
## pos
library(tidyverse)
library(topicmodels)
library(ggplot2)
library(dplyr)
library('showtext')
## 필요한 패키지를 로딩중입니다: sysfonts
## 필요한 패키지를 로딩중입니다: showtextdb
##
## 다음의 패키지를 부착합니다: 'showtextdb'
##
## The following object is masked from 'package:extrafont':
##
## font_install
showtext_auto()
library("writexl")
library(ggwordcloud)
df <- readxl::read_excel("df5.xlsx")
df<-unite(df, col='phy1', c('phy1', 'phy2'), sep='. ')
df_phy<-df%>% select(no, phy1, ph2, re_female, re_age, re_spouse, re_health, re_hosp_yr, care_age, care_parent, si_rel, care_univ, physical, economic, social, dependet, emotional, institute)
df_phy<-df%>%
select(no, phy1, institute, physical)
dfphy_tk <- df_phy %>%
# 문자 혹은 공백 이외 것 제거
mutate(text = str_remove_all(phy1, "[^(\\w+|\\s)]")) %>%
# 메카브로 품사 중 명사만 추출
unnest_tokens(word, text, token = RcppMeCab::pos, drop = FALSE) %>%
separate(col = word,
into = c("word", "morph"),
sep = "/" ) %>%
filter(morph == "nng")
dfphy_tk %>% glimpse()
## Rows: 352
## Columns: 7
## $ no <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ phy1 <chr> "안만 해도 환자이기 때문에 이동할 때가 좀 크죠\r\n\r\n. 이동…
## $ institute <chr> "시설", "시설", "시설", "시설", "시설", "시설", "시설", "시…
## $ physical <chr> "하", "하", "하", "하", "하", "하", "하", "하", "하", "하", …
## $ text <chr> "안만 해도 환자이기 때문에 이동할 때가 좀 크죠\r\n\r\n 이동…
## $ word <chr> "환자", "이동", "때", "이동", "때", "요양원", "신체", "케어"…
## $ morph <chr> "nng", "nng", "nng", "nng", "nng", "nng", "nng", "nng", "nng…
dfphy_tk$no <- as.numeric(dfphy_tk$no)
## BOW 데이터 변환
dfphy_bow <- df_phy %>%
## 문서 내 텍스트에서 명사 추출
mutate(mecab_pos = map(phy1, RcppMeCab::pos) ) %>%
unnest(mecab_pos) %>%
unnest(mecab_pos) %>%
separate(mecab_pos, into = c("nouns", "pos"), sep = "/") %>%
filter(pos == "NNG") %>%
## 문서별 명사 빈도수
group_by(no) %>%
count(nouns, sort = TRUE)
dfphy_bow$no <- as.numeric(dfphy_bow$no)
dfphy2 <- inner_join(dfphy_bow, dfphy_tk)
## Joining with `by = join_by(no)`
collapsed_dfphy2 <- dfphy2 %>%
group_by(nouns) %>%
summarize(count = sum(n, na.rm = TRUE))
library(gt)
collapsed_dfphy2$total <- sum(collapsed_dfphy2$count)
collapsed_dfphy2$freq <- collapsed_dfphy2$count/collapsed_dfphy2$total
freq_phy_all<-collapsed_dfphy2 %>%
filter(count>29)
freq_phy_all <- freq_phy_all[order(-freq_phy_all$count), ]
write_xlsx(freq_phy_all, "freq_phy_all.xlsx")
collapsed_dfphy2 <- dfphy2 %>%
group_by(institute, nouns) %>%
summarize(count = sum(n, na.rm = TRUE))
## `summarise()` has grouped output by 'institute'. You can override using the
## `.groups` argument.
png("phy_plot.png")
phy_plot<- ggplot(data = freq_phy_all, aes(label = nouns, size = count, color = count))+
geom_text_wordcloud_area(area_corr_power = 1, eccentricity = 1)+
scale_size_area(max_size = 20) +
scale_color_gradient(low = "#C7DDEC", high = "#1F78B4") +
theme_minimal()
print(phy_plot)
dev.off()
## png
## 2
frequency <- collapsed_dfphy2 %>%
left_join(collapsed_dfphy2 %>% group_by(institute)%>%
summarize(total=sum(count, na.rm=TRUE))) %>%
mutate(freq = count/total)
## Joining with `by = join_by(institute)`
frequency<- frequency[order(frequency$institute, -frequency$freq), ]
frequency
write_xlsx(freq_phy_all, "freq_phy_by_institute.xlsx")
summary(frequency)
## institute nouns count total
## Length:191 Length:191 Min. : 2.00 Min. : 675
## Class :character Class :character 1st Qu.: 14.00 1st Qu.:1655
## Mode :character Mode :character Median : 21.00 Median :1655
## Mean : 33.55 Mean :2488
## 3rd Qu.: 30.00 3rd Qu.:4079
## Max. :332.00 Max. :4079
## freq
## Min. :0.0007355
## 1st Qu.:0.0066465
## Median :0.0096677
## Mean :0.0157068
## 3rd Qu.:0.0205685
## Max. :0.1166163
summary(frequency$count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 14.00 21.00 33.55 30.00 332.00
library(ggplot2)
ggplot(frequency, aes(freq, fill = institute)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.20) +
facet_wrap(~institute, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
frequency1_tf_idf <- frequency %>%
bind_tf_idf(nouns, institute, count)
frequency1_tf_idf
frequency1_tf_idf <-frequency1_tf_idf %>%
select(-total) %>%
arrange(institute, desc(tf_idf))
write_xlsx(frequency1_tf_idf, "freq_tf_idf_phy_by_institute.xlsx")
library(forcats)
frequency1_tf_idf %>%
group_by(institute) %>%
slice_max(tf_idf, n = 7) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(nouns, tf_idf), fill = institute)) +
geom_col(show.legend = FALSE) +
facet_wrap(~institute, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = NULL)
library(tidyr)
table(frequency$institute)
##
## 가정내 시설 재가급여
## 72 38 81
frequency <- frequency %>%
select(institute, nouns, freq) %>%
pivot_wider(names_from = institute, values_from = freq)
frequency
word_ratios <- collapsed_dfphy2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(시설/가정내)) %>%
arrange(desc(logratio))
word_ratios %>%
arrange(abs(logratio))
word_ratios %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (요양시설/가정내)") +
scale_fill_discrete(name = "", labels = c("요양시설", "가정내"))
word_ratios2 <- collapsed_dfphy2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(시설/재가급여)) %>%
arrange(desc(logratio))
word_ratios2 %>%
arrange(abs(logratio))
word_ratios2 %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (요양시설/재가급여)") +
scale_fill_discrete(name = "", labels = c("요양시설", "재가급여"))
word_ratios3 <- collapsed_dfphy2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(재가급여/시설)) %>%
arrange(desc(logratio))
word_ratios3 %>%
arrange(abs(logratio))
word_ratios3 %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (재가급여/요양시설)") +
scale_fill_discrete(name = "", labels = c("재가급여", "요양시설"))
Blei 등 일군의 전산학자들은 문서 내 단어의 확률분포를 계산해 찾아낸 일련의 단어 군집을 통해 문서의 주제를 추론하는 방법으로서 LDA(Latent Dirichlet Allocation)을 제시했다.
19세기 독일 수학자 러죈 디리클레(Lejeune Dirichlet, 1805 ~ 1859)가 제시한 디리클래 분포(Dirichlet distribution)를 이용해 문서에 잠재된 주제를 추론하기에 잠재 디리클레 할당(LDA: Latent Dirichlet Allocation)이라고 했다. 문서의 주제를 추론하는 방법이므로 주제모형(topic models)이라고 한다.
LDA에는 다음과 같은 전제가 있다.
말뭉치에는 단어를 통해 분포된 다수의 주제가 있다 (위 그림의 가장 왼쪽). 각 문서에서 주제를 생성하는 과정은 다음과 같다.
LDA에서 정의하는 주제(topic)는 특정 단어에 대한 분포다. 예를 들어, 유전학 주제라면 유전학에 대하여 높은 확률로 분포하는 단어들이고, 진화생물학 주제라면 진화생물학에 대하여 높은 확률로 분포하는 단어들이다.
LDA에서는 문서를 주머니에 무작위로 섞여 있는 임의의 혼합물로 본다(Bag of words). 일반적으로 사용하는 문장처럼 문법이라는 짜임새있는 구조로 보는 것이 아니다. 임의의 혼합물이지만 온전하게 무작위로 섞여 있는 것은 아니다. 서로 함께 모여 있는 군집이 확률적으로 존재한다. 즉, 주제모형에서 접근하는 문서는 잠재된 주제의 혼합물로서, 각 주제를 구성하는 단어 단위가 확률적으로 혼합된 주머니(bag)인 셈이다.
주제모형의 목표는 말뭉치에서 주제의 자동추출이다. 문서 자체는 관측가능하지만, 주제의 구조(문서별 주제의 분포와 문서별-단어별 주제할당)는 감춰져 있다. 감춰진 주제의 구조를 찾아내는 작업은 뒤집어진 생성과정이라고 할 수 있다. 관측된 말뭉치를 생성하는 감춰진 구조를 찾아내는 작업이기 때문이다. 문서에 대한 사전 정보없이 문서의 주제를 분류하기 때문에 주제모형은 비지도학습(unsupervised learning) 방식의 기계학습(machine learning)이 된다.
기계학습(machine learing): 인공지능 작동방식. 투입한 데이터에서 규칙성 탐지해 분류 및 예측. 지도학습, 비지도학습, 강화학습 등으로 구분.
지도학습(supervised learning): 인간이 사전에 분류한 결과를 학습해 투입한 자료에서 규칙성 혹은 경향 발견
비지도학습(unsupervised learning): 사전분류한 결과 없이 기계 스스로 투입한 자료에서 규칙성 혹은 경향 발견
강화학습(reinforced learning): 행동의 결과에 대한 피드백(보상, 처벌 등)을 통해 투입한 자료에서 규칙성혹은 경향 발견
주제모형의 효용은 대량의 문서에서 의미구조를 닮은 주제구조를 추론해 주석을 자동으로 부여할 수 있다는데 있다.
주제모형은 다양한 패지키가 있다.
topicmodels 패키지의 LDA() 함수를 사용하여 k = 3로 설정하여 두 가지 주제의 LDA 모델을 만들 수 있습니다.
실제로 거의 모든 주제 모델은 더 큰 k를 사용하지만 이 분석 접근 방식이 더 많은 수의 주제로 확장된다는 사실을 곧 알게 될 것입니다.
이 함수는 단어가 주제와 연결되는 방식, 주제가 문서와 연결되는 방식 등 모델 피팅의 전체 세부정보가 포함된 개체를 반환합니다.
## DTM 변환
dfphy_dtm <- dfphy_bow %>%
cast_dtm(document = no, term = nouns, value = n) %>%
as.matrix
## LDA 모형 적합
dfphy_lda <- LDA(dfphy_dtm, k = 3, method="Gibbs",
control=list(alpha=0.1, delta=0.1, seed=1357))
dfphy_topics <- tidy(dfphy_lda, matrix = "beta")
dfphy_topics
dfphy_top_terms <- dfphy_topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, -beta)
dfphy_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()
* 주제 내에서 가장 일반적인 10개의 용어를 찾을 수 있습니다.그림으로
보여줍니다. * 이 부분은 정리해서 논문에 그림을 넣어주세요..
## 토픽 결과 - 행렬
tidy(dfphy_lda, matrix="gamma") %>%
arrange(document) %>%
spread(topic, gamma)
library(tidyr)
beta_wide <- dfphy_topics %>%
mutate(topic = paste0("topic", topic)) %>%
pivot_wider(names_from = topic, values_from = beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_wide
dfphy_gamma <- tidy(dfphy_lda, matrix="gamma")
dfphy_gamma$no=as.numeric(dfphy_gamma$document)
dfphy_gamma <- left_join(dfphy_gamma, dfphy_tk)
## Joining with `by = join_by(no)`
dfphy_gamma %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ institute) +
labs(x = "topic", y = expression(gamma))
dfphy_gamma$physical= as.factor(dfphy_gamma$physical)
dfphy_gamma %>%
mutate(physical = reorder(physical, gamma * topic)) %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ physical) +
labs(x = "topic", y = expression(gamma))
tidy(dfphy_lda, matrix="gamma") %>%
filter((topic==1 & gamma > .9) | (topic==2 & gamma > .9) |(topic==3 & gamma > .9))
dfphy_topic_g <- tidy(dfphy_lda, matrix="gamma") %>%
filter((topic==1 & gamma > .9) | (topic==2 & gamma > .9) |(topic==3 & gamma > .9)) %>%
mutate(topic = as.factor(topic)) %>%
ggplot(aes(x = document, y=gamma)) +
geom_col(aes(fill = topic), position=position_dodge()) +
scale_fill_manual(values = c("orange", "midnightblue", "blue")) +
theme_light() +
labs(title="신체적부담 토픽모형",
subtitle = "문서 토픽 행렬")
dfphy_topic_g
rm(list=ls())
df <- readxl::read_excel("df5.xlsx")
df<-unite(df, col='eco1', c('eco1', 'eco2'), sep='. ')
df_eco<-df%>%
select(no, eco1, institute, economic)
dfeco_tk <- df_eco %>%
# 문자 혹은 공백 이외 것 제거
mutate(text = str_remove_all(eco1, "[^(\\w+|\\s)]")) %>%
# 메카브로 품사 중 명사만 추출
unnest_tokens(word, text, token = RcppMeCab::pos, drop = FALSE) %>%
separate(col = word,
into = c("word", "morph"),
sep = "/" ) %>%
filter(morph == "nng")
dfeco_tk %>% glimpse()
## Rows: 241
## Columns: 7
## $ no <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ eco1 <chr> "갑작스럽게 다쳤을 때 병원에 입원하면은 간병비가 좀 부담스러…
## $ institute <chr> "시설", "시설", "시설", "시설", "시설", "시설", "시설", "시…
## $ economic <chr> "상", "상", "상", "상", "상", "상", "상", "상", "중", "중", …
## $ text <chr> "갑작스럽게 다쳤을 때 병원에 입원하면은 간병비가 좀 부담스러…
## $ word <chr> "때", "병원", "입원", "병비", "부담", "가정", "경제", "부담"…
## $ morph <chr> "nng", "nng", "nng", "nng", "nng", "nng", "nng", "nng", "nng…
dfeco_tk$no <- as.numeric(dfeco_tk$no)
## BOW 데이터 변환
dfeco_bow <- df_eco %>%
## 문서 내 텍스트에서 명사 추출
mutate(mecab_pos = map(eco1, RcppMeCab::pos) ) %>%
unnest(mecab_pos) %>%
unnest(mecab_pos) %>%
separate(mecab_pos, into = c("nouns", "pos"), sep = "/") %>%
filter(pos == "NNG") %>%
## 문서별 명사 빈도수
group_by(no) %>%
count(nouns, sort = TRUE)
dfeco_bow$no <- as.numeric(dfeco_bow$no)
dfeco2 <- inner_join(dfeco_bow, dfeco_tk)
## Joining with `by = join_by(no)`
collapsed_dfeco2 <- dfeco2 %>%
group_by(nouns) %>%
summarize(count = sum(n, na.rm = TRUE))
collapsed_dfeco2$total <- sum(collapsed_dfeco2$count)
collapsed_dfeco2$freq <- collapsed_dfeco2$count/collapsed_dfeco2$total
freq_eco_all<-collapsed_dfeco2 %>%
filter(count>29)
freq_eco_all <- freq_eco_all[order(-freq_eco_all$count), ]
write_xlsx(freq_eco_all, "freq_eco_all.xlsx")
collapsed_dfeco2 <- dfeco2 %>%
group_by(institute, nouns) %>%
summarize(count = sum(n, na.rm = TRUE))
## `summarise()` has grouped output by 'institute'. You can override using the
## `.groups` argument.
png("eco_plot.png")
eco_plot<- ggplot(data = freq_eco_all, aes(label = nouns, size = count, color = count))+
geom_text_wordcloud_area(area_corr_power = 1, eccentricity = 1)+
scale_size_area(max_size = 20) +
scale_color_gradient(low = "#C7DDEC", high = "#1F78B4") +
theme_minimal()
print(eco_plot)
dev.off()
## png
## 2
frequency <- collapsed_dfeco2 %>%
left_join(collapsed_dfeco2 %>% group_by(institute)%>%
summarize(total=sum(count, na.rm=TRUE))) %>%
mutate(freq = count/total)
## Joining with `by = join_by(institute)`
frequency<- frequency[order(frequency$institute, -frequency$freq), ]
frequency
write_xlsx(freq_eco_all, "freq_eco_by_institute.xlsx")
summary(frequency)
## institute nouns count total
## Length:161 Length:161 Min. : 2.00 Min. : 747
## Class :character Class :character 1st Qu.: 11.00 1st Qu.: 747
## Mode :character Mode :character Median : 14.00 Median : 806
## Mean : 32.86 Mean :1883
## 3rd Qu.: 38.00 3rd Qu.:3738
## Max. :522.00 Max. :3738
## freq
## Min. :0.0008026
## 1st Qu.:0.0074442
## Median :0.0155163
## Mean :0.0186335
## 3rd Qu.:0.0235732
## Max. :0.1396469
summary(frequency$count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 11.00 14.00 32.86 38.00 522.00
library(ggplot2)
ggplot(frequency, aes(freq, fill = institute)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.20) +
facet_wrap(~institute, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
frequency1_tf_idf <- frequency %>%
bind_tf_idf(nouns, institute, count)
frequency1_tf_idf
frequency1_tf_idf <-frequency1_tf_idf %>%
select(-total) %>%
arrange(institute, desc(tf_idf))
write_xlsx(frequency1_tf_idf, "freq_tf_idf_eco_by_institute.xlsx")
library(forcats)
frequency1_tf_idf %>%
group_by(institute) %>%
slice_max(tf_idf, n = 7) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(nouns, tf_idf), fill = institute)) +
geom_col(show.legend = FALSE) +
facet_wrap(~institute, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = NULL)
library(tidyr)
table(frequency$institute)
##
## 가정내 시설 재가급여
## 59 42 60
frequency <- frequency %>%
select(institute, nouns, freq) %>%
pivot_wider(names_from = institute, values_from = freq)
frequency
word_ratios <- collapsed_dfeco2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(시설/가정내)) %>%
arrange(desc(logratio))
word_ratios %>%
arrange(abs(logratio))
word_ratios %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (요양시설/가정내)") +
scale_fill_discrete(name = "", labels = c("요양시설", "가정내"))
word_ratios2 <- collapsed_dfeco2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(시설/재가급여)) %>%
arrange(desc(logratio))
word_ratios2 %>%
arrange(abs(logratio))
word_ratios2 %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (요양시설/재가급여)") +
scale_fill_discrete(name = "", labels = c("요양시설", "재가급여"))
word_ratios3 <- collapsed_dfeco2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(재가급여/시설)) %>%
arrange(desc(logratio))
word_ratios3 %>%
arrange(abs(logratio))
word_ratios3 %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (재가급여/요양시설)") +
scale_fill_discrete(name = "", labels = c("재가급여", "요양시설"))
## DTM 변환
dfeco_dtm <- dfeco_bow %>%
cast_dtm(document = no, term = nouns, value = n) %>%
as.matrix
## LDA 모형 적합
dfeco_lda <- LDA(dfeco_dtm, k = 3, method="Gibbs",
control=list(alpha=0.1, delta=0.1, seed=1357))
dfeco_topics <- tidy(dfeco_lda, matrix = "beta")
dfeco_topics
dfeco_top_terms <- dfeco_topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, -beta)
dfeco_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()
## 토픽 결과 - 행렬
tidy(dfeco_lda, matrix="gamma") %>%
arrange(document) %>%
spread(topic, gamma)
library(tidyr)
beta_wide <- dfeco_topics %>%
mutate(topic = paste0("topic", topic)) %>%
pivot_wider(names_from = topic, values_from = beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_wide
dfeco_gamma <- tidy(dfeco_lda, matrix="gamma")
dfeco_gamma$no=as.numeric(dfeco_gamma$document)
dfeco_gamma <- left_join(dfeco_gamma, dfeco_tk)
## Joining with `by = join_by(no)`
dfeco_gamma %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ institute) +
labs(x = "topic", y = expression(gamma))
dfeco_gamma$economic= as.factor(dfeco_gamma$economic)
dfeco_gamma %>%
mutate(eoconomic = reorder(economic, gamma * topic)) %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ economic) +
labs(x = "topic", y = expression(gamma))
tidy(dfeco_lda, matrix="gamma") %>%
filter((topic==1 & gamma > .8) | (topic==2 & gamma > .7) |(topic==3 & gamma > .9))
dfeco_topic_g <- tidy(dfeco_lda, matrix="gamma") %>%
filter((topic==1 & gamma > .8) | (topic==2 & gamma > .7) |(topic==3 & gamma > .9)) %>%
mutate(topic = as.factor(topic)) %>%
ggplot(aes(x = document, y=gamma)) +
geom_col(aes(fill = topic), position=position_dodge()) +
scale_fill_manual(values = c("orange", "midnightblue", "blue")) +
theme_light() +
labs(title="경제적부담 토픽모형",
subtitle = "문서 토픽 행렬")
dfeco_topic_g
rm(list=ls())
df <- readxl::read_excel("df5.xlsx")
df<-unite(df, col='soc1', c('soc1', 'soc2'), sep='. ')
df_soc<-df%>%
select(no, soc1, institute, social)
dfsoc_tk <- df_soc %>%
# 문자 혹은 공백 이외 것 제거
mutate(text = str_remove_all(soc1, "[^(\\w+|\\s)]")) %>%
# 메카브로 품사 중 명사만 추출
unnest_tokens(word, text, token = RcppMeCab::pos, drop = FALSE) %>%
separate(col = word,
into = c("word", "morph"),
sep = "/" ) %>%
filter(morph == "nng")
dfsoc_tk %>% glimpse()
## Rows: 250
## Columns: 7
## $ no <dbl> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, …
## $ soc1 <chr> "있습니다.. 아무래도 직장생활을 하기에 업무가 밀리기 때문에 …
## $ institute <chr> "시설", "시설", "시설", "시설", "시설", "시설", "시설", "시…
## $ social <chr> "하", "하", "하", "하", "하", "상", "상", "상", "상", "상", …
## $ text <chr> "있습니다 아무래도 직장생활을 하기에 업무가 밀리기 때문에 제…
## $ word <chr> "직장", "생활", "업무", "삶", "여유", "약속", "일", "상황", …
## $ morph <chr> "nng", "nng", "nng", "nng", "nng", "nng", "nng", "nng", "nng…
dfsoc_tk$no <- as.numeric(dfsoc_tk$no)
## BOW 데이터 변환
dfsoc_bow <- df_soc %>%
## 문서 내 텍스트에서 명사 추출
mutate(mecab_pos = map(soc1, RcppMeCab::pos) ) %>%
unnest(mecab_pos) %>%
unnest(mecab_pos) %>%
separate(mecab_pos, into = c("nouns", "pos"), sep = "/") %>%
filter(pos == "NNG") %>%
## 문서별 명사 빈도수
group_by(no) %>%
count(nouns, sort = TRUE)
dfsoc_bow$no <- as.numeric(dfsoc_bow$no)
dfsoc2 <- inner_join(dfsoc_bow, dfsoc_tk)
## Joining with `by = join_by(no)`
collapsed_dfsoc2 <- dfsoc2 %>%
group_by(nouns) %>%
summarize(count = sum(n, na.rm = TRUE))
collapsed_dfsoc2$total <- sum(collapsed_dfsoc2$count)
collapsed_dfsoc2$freq <- collapsed_dfsoc2$count/collapsed_dfsoc2$total
freq_soc_all<-collapsed_dfsoc2 %>%
filter(count>29)
freq_soc_all <- freq_soc_all[order(-freq_soc_all$count), ]
write_xlsx(freq_soc_all, "freq_soc_all.xlsx")
collapsed_dfsoc2 <- dfsoc2 %>%
group_by(institute, nouns) %>%
summarize(count = sum(n, na.rm = TRUE))
## `summarise()` has grouped output by 'institute'. You can override using the
## `.groups` argument.
png("soc_plot.png")
soc_plot<- ggplot(data = freq_soc_all, aes(label = nouns, size = count, color = count))+
geom_text_wordcloud_area(area_corr_power = 1, eccentricity = 1)+
scale_size_area(max_size = 20) +
scale_color_gradient(low = "#C7DDEC", high = "#1F78B4") +
theme_minimal()
print(soc_plot)
dev.off()
## png
## 2
frequency <- collapsed_dfsoc2 %>%
left_join(collapsed_dfsoc2 %>% group_by(institute)%>%
summarize(total=sum(count, na.rm=TRUE))) %>%
mutate(freq = count/total)
## Joining with `by = join_by(institute)`
frequency<- frequency[order(frequency$institute, -frequency$freq), ]
frequency
write_xlsx(freq_soc_all, "freq_soc_by_institute.xlsx")
summary(frequency)
## institute nouns count total
## Length:176 Length:176 Min. : 1.00 Min. : 784
## Class :character Class :character 1st Qu.: 11.00 1st Qu.:1118
## Mode :character Mode :character Median : 14.00 Median :1118
## Mean : 20.41 Mean :1251
## 3rd Qu.: 28.00 3rd Qu.:1690
## Max. :128.00 Max. :1690
## freq
## Min. :0.0005917
## 1st Qu.:0.0076923
## Median :0.0143615
## Mean :0.0170455
## 3rd Qu.:0.0189349
## Max. :0.0854592
summary(frequency$count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 11.00 14.00 20.41 28.00 128.00
library(ggplot2)
ggplot(frequency, aes(freq, fill = institute)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.20) +
facet_wrap(~institute, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
frequency1_tf_idf <- frequency %>%
bind_tf_idf(nouns, institute, count)
frequency1_tf_idf
frequency1_tf_idf <-frequency1_tf_idf %>%
select(-total) %>%
arrange(institute, desc(tf_idf))
write_xlsx(frequency1_tf_idf, "freq_tf_idf_soc_by_institute.xlsx")
library(forcats)
frequency1_tf_idf %>%
group_by(institute) %>%
slice_max(tf_idf, n = 7) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(nouns, tf_idf), fill = institute)) +
geom_col(show.legend = FALSE) +
facet_wrap(~institute, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = NULL)
library(tidyr)
table(frequency$institute)
##
## 가정내 시설 재가급여
## 67 43 66
frequency <- frequency %>%
select(institute, nouns, freq) %>%
pivot_wider(names_from = institute, values_from = freq)
frequency
word_ratios <- collapsed_dfsoc2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(시설/가정내)) %>%
arrange(desc(logratio))
word_ratios %>%
arrange(abs(logratio))
word_ratios %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (요양시설/가정내)") +
scale_fill_discrete(name = "", labels = c("요양시설", "가정내"))
word_ratios2 <- collapsed_dfsoc2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(시설/재가급여)) %>%
arrange(desc(logratio))
word_ratios2 %>%
arrange(abs(logratio))
word_ratios2 %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (요양시설/재가급여)") +
scale_fill_discrete(name = "", labels = c("요양시설", "재가급여"))
word_ratios3 <- collapsed_dfsoc2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(재가급여/시설)) %>%
arrange(desc(logratio))
word_ratios3 %>%
arrange(abs(logratio))
word_ratios3 %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (재가급여/요양시설)") +
scale_fill_discrete(name = "", labels = c("재가급여", "요양시설"))
## DTM 변환
dfsoc_dtm <- dfsoc_bow %>%
cast_dtm(document = no, term = nouns, value = n) %>%
as.matrix
## LDA 모형 적합
dfsoc_lda <- LDA(dfsoc_dtm, k = 3, method="Gibbs",
control=list(alpha=0.1, delta=0.1, seed=1357))
dfsoc_topics <- tidy(dfsoc_lda, matrix = "beta")
dfsoc_topics
dfsoc_top_terms <- dfsoc_topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, -beta)
dfsoc_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()
## 토픽 결과 - 행렬
tidy(dfsoc_lda, matrix="gamma") %>%
arrange(document) %>%
spread(topic, gamma)
library(tidyr)
beta_wide <- dfsoc_topics %>%
mutate(topic = paste0("topic", topic)) %>%
pivot_wider(names_from = topic, values_from = beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_wide
dfsoc_gamma <- tidy(dfsoc_lda, matrix="gamma")
dfsoc_gamma$no=as.numeric(dfsoc_gamma$document)
dfsoc_gamma <- left_join(dfsoc_gamma, dfsoc_tk)
## Joining with `by = join_by(no)`
dfsoc_gamma %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ institute) +
labs(x = "topic", y = expression(gamma))
dfsoc_gamma$social= as.factor(dfsoc_gamma$social)
dfsoc_gamma %>%
mutate(social = reorder(social, gamma * topic)) %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ social) +
labs(x = "topic", y = expression(gamma))
tidy(dfsoc_lda, matrix="gamma") %>%
filter((topic==1 & gamma > .9) | (topic==2 & gamma > .9) |(topic==3 & gamma > .8))
dfsoc_topic_g <- tidy(dfsoc_lda, matrix="gamma") %>%
filter((topic==1 & gamma > .9) | (topic==2 & gamma > .9) |(topic==3 & gamma > .8)) %>%
mutate(topic = as.factor(topic)) %>%
ggplot(aes(x = document, y=gamma)) +
geom_col(aes(fill = topic), position=position_dodge()) +
scale_fill_manual(values = c("orange", "midnightblue", "blue")) +
theme_light() +
labs(title="사회적부담 토픽모형",
subtitle = "문서 토픽 행렬")
dfsoc_topic_g
rm(list=ls())
df <- readxl::read_excel("df5.xlsx")
df<-unite(df, col='dep1', c('dep1', 'dep2'), sep='. ')
sf_dep<-df%>%
select(no, dep1, institute, dependet)
dfdep_tk <- sf_dep %>%
# 문자 혹은 공백 이외 것 제거
mutate(text = str_remove_all(dep1, "[^(\\w+|\\s)]")) %>%
# 메카브로 품사 중 명사만 추출
unnest_tokens(word, text, token = RcppMeCab::pos, drop = FALSE) %>%
separate(col = word,
into = c("word", "morph"),
sep = "/" ) %>%
filter(morph == "nng")
dfdep_tk %>% glimpse()
## Rows: 234
## Columns: 7
## $ no <dbl> 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, …
## $ dep1 <chr> "인지가 있을 때는 자주 찾아와서 가족의 안부를 듣고 싶어 하셨…
## $ institute <chr> "시설", "시설", "시설", "시설", "시설", "시설", "시설", "시…
## $ dependet <chr> "중", "중", "중", "중", "하", "하", "하", "하", "하", "하", …
## $ text <chr> "인지가 있을 때는 자주 찾아와서 가족의 안부를 듣고 싶어 하셨…
## $ word <chr> "인지", "때", "가족", "안부", "병원", "때", "병원", "그때", …
## $ morph <chr> "nng", "nng", "nng", "nng", "nng", "nng", "nng", "nng", "nng…
dfdep_tk$no <- as.numeric(dfdep_tk$no)
## BOW 데이터 변환
dfdep_bow <- sf_dep %>%
## 문서 내 텍스트에서 명사 추출
mutate(mecab_pos = map(dep1, RcppMeCab::pos) ) %>%
unnest(mecab_pos) %>%
unnest(mecab_pos) %>%
separate(mecab_pos, into = c("nouns", "pos"), sep = "/") %>%
filter(pos == "NNG") %>%
## 문서별 명사 빈도수
group_by(no) %>%
count(nouns, sort = TRUE)
dfdep_bow$no <- as.numeric(dfdep_bow$no)
dfdep2 <- inner_join(dfdep_bow, dfdep_tk)
## Joining with `by = join_by(no)`
collapsed_dfdep2 <- dfdep2 %>%
group_by(nouns) %>%
summarize(count = sum(n, na.rm = TRUE))
collapsed_dfdep2$total <- sum(collapsed_dfdep2$count)
collapsed_dfdep2$freq <- collapsed_dfdep2$count/collapsed_dfdep2$total
freq_dep_all<-collapsed_dfdep2 %>%
filter(count>29)
freq_dep_all <- freq_dep_all[order(-freq_dep_all$count), ]
write_xlsx(freq_dep_all, "freq_dep_all.xlsx")
collapsed_dfdep2 <- dfdep2 %>%
group_by(institute, nouns) %>%
summarize(count = sum(n, na.rm = TRUE))
## `summarise()` has grouped output by 'institute'. You can override using the
## `.groups` argument.
png("dep_plot.png")
dep_plot<- ggplot(data = freq_dep_all, aes(label = nouns, size = count, color = count))+
geom_text_wordcloud_area(area_corr_power = 1, eccentricity = 1)+
scale_size_area(max_size = 20) +
scale_color_gradient(low = "#C7DDEC", high = "#1F78B4") +
theme_minimal()
print(dep_plot)
dev.off()
## png
## 2
frequency <- collapsed_dfdep2 %>%
left_join(collapsed_dfdep2 %>% group_by(institute)%>%
summarize(total=sum(count, na.rm=TRUE))) %>%
mutate(freq = count/total)
## Joining with `by = join_by(institute)`
frequency<- frequency[order(frequency$institute, -frequency$freq), ]
frequency
write_xlsx(freq_dep_all, "freq_dep_by_institute.xlsx")
summary(frequency)
## institute nouns count total
## Length:150 Length:150 Min. : 1.00 Min. : 266
## Class :character Class :character 1st Qu.: 6.00 1st Qu.: 913
## Mode :character Mode :character Median : 12.00 Median : 913
## Mean : 19.69 Mean :1142
## 3rd Qu.: 24.00 3rd Qu.:1774
## Max. :114.00 Max. :1774
## freq
## Min. :0.0005637
## 1st Qu.:0.0076670
## Median :0.0142388
## Mean :0.0200000
## 3rd Qu.:0.0225564
## Max. :0.1248631
summary(frequency$count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.00 12.00 19.69 24.00 114.00
library(ggplot2)
ggplot(frequency, aes(freq, fill = institute)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.20) +
facet_wrap(~institute, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
frequency1_tf_idf <- frequency %>%
bind_tf_idf(nouns, institute, count)
frequency1_tf_idf
frequency1_tf_idf <-frequency1_tf_idf %>%
select(-total) %>%
arrange(institute, desc(tf_idf))
write_xlsx(frequency1_tf_idf, "freq_tf_idf_dep_by_institute.xlsx")
library(forcats)
frequency1_tf_idf %>%
group_by(institute) %>%
slice_max(tf_idf, n = 7) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(nouns, tf_idf), fill = institute)) +
geom_col(show.legend = FALSE) +
facet_wrap(~institute, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = NULL)
library(tidyr)
table(frequency$institute)
##
## 가정내 시설 재가급여
## 54 32 64
frequency <- frequency %>%
select(institute, nouns, freq) %>%
pivot_wider(names_from = institute, values_from = freq)
frequency
word_ratios <- collapsed_dfdep2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(시설/가정내)) %>%
arrange(desc(logratio))
word_ratios %>%
arrange(abs(logratio))
word_ratios %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (요양시설/가정내)") +
scale_fill_discrete(name = "", labels = c("요양시설", "가정내"))
word_ratios2 <- collapsed_dfdep2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(시설/재가급여)) %>%
arrange(desc(logratio))
word_ratios2 %>%
arrange(abs(logratio))
word_ratios2 %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (요양시설/재가급여)") +
scale_fill_discrete(name = "", labels = c("요양시설", "재가급여"))
word_ratios3 <- collapsed_dfdep2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(재가급여/시설)) %>%
arrange(desc(logratio))
word_ratios3 %>%
arrange(abs(logratio))
word_ratios3 %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (재가급여/요양시설)") +
scale_fill_discrete(name = "", labels = c("재가급여", "요양시설"))
## DTM 변환
dfdep_dtm <- dfdep_bow %>%
cast_dtm(document = no, term = nouns, value = n) %>%
as.matrix
## LDA 모형 적합
dfdep_lda <- LDA(dfdep_dtm, k = 3, method="Gibbs",
control=list(alpha=0.1, delta=0.1, seed=1357))
dfdep_topics <- tidy(dfdep_lda, matrix = "beta")
dfdep_topics
dfdep_top_terms <- dfdep_topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, -beta)
dfdep_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()
## 토픽 결과 - 행렬
tidy(dfdep_lda, matrix="gamma") %>%
arrange(document) %>%
spread(topic, gamma)
library(tidyr)
beta_wide <- dfdep_topics %>%
mutate(topic = paste0("topic", topic)) %>%
pivot_wider(names_from = topic, values_from = beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_wide
dfdep_gamma <- tidy(dfdep_lda, matrix="gamma")
dfdep_gamma$no=as.numeric(dfdep_gamma$document)
dfdep_gamma <- left_join(dfdep_gamma, dfdep_tk)
## Joining with `by = join_by(no)`
dfdep_gamma %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ institute) +
labs(x = "topic", y = expression(gamma))
dfdep_gamma$dependet= as.factor(dfdep_gamma$dependet)
dfdep_gamma %>%
mutate(dependet = reorder(dependet, gamma * topic)) %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ dependet) +
labs(x = "topic", y = expression(gamma))
tidy(dfdep_lda, matrix="gamma") %>%
filter((topic==1 & gamma > .9) | (topic==2 & gamma > .9) |(topic==3 & gamma > .9))
dfdep_topic_g <- tidy(dfdep_lda, matrix="gamma") %>%
filter((topic==1 & gamma > .9) | (topic==2 & gamma > .9) |(topic==3 & gamma > .8)) %>%
mutate(topic = as.factor(topic)) %>%
ggplot(aes(x = document, y=gamma)) +
geom_col(aes(fill = topic), position=position_dodge()) +
scale_fill_manual(values = c("orange", "midnightblue", "blue")) +
theme_light() +
labs(title="의존적부담 토픽모형",
subtitle = "문서 토픽 행렬")
dfdep_topic_g
rm(list=ls())
df <- readxl::read_excel("df5.xlsx")
df<-unite(df, col='emo1', c('emo1', 'emo2'), sep='. ')
df_emo<-df%>%
select(no, emo1, institute, emotional)
dfemo_tk <- df_emo %>%
# 문자 혹은 공백 이외 것 제거
mutate(text = str_remove_all(emo1, "[^(\\w+|\\s)]")) %>%
# 메카브로 품사 중 명사만 추출
unnest_tokens(word, text, token = RcppMeCab::pos, drop = FALSE) %>%
separate(col = word,
into = c("word", "morph"),
sep = "/" ) %>%
filter(morph == "nng")
dfemo_tk %>% glimpse()
## Rows: 374
## Columns: 7
## $ no <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ emo1 <chr> "암만 해도 이제 직장생활을 하기 때문에 네네 꼭 업무가 바쁠 …
## $ institute <chr> "시설", "시설", "시설", "시설", "시설", "시설", "시설", "시…
## $ emotional <chr> "중", "중", "중", "중", "중", "중", "중", "중", "중", "중", …
## $ text <chr> "암만 해도 이제 직장생활을 하기 때문에 네네 꼭 업무가 바쁠 …
## $ word <chr> "직장", "생활", "업무", "때", "상태", "업무", "지장", "심리"…
## $ morph <chr> "nng", "nng", "nng", "nng", "nng", "nng", "nng", "nng", "nng…
dfemo_tk$no <- as.numeric(dfemo_tk$no)
## BOW 데이터 변환
dfemo_bow <- df_emo %>%
## 문서 내 텍스트에서 명사 추출
mutate(mecab_pos = map(emo1, RcppMeCab::pos) ) %>%
unnest(mecab_pos) %>%
unnest(mecab_pos) %>%
separate(mecab_pos, into = c("nouns", "pos"), sep = "/") %>%
filter(pos == "NNG") %>%
## 문서별 명사 빈도수
group_by(no) %>%
count(nouns, sort = TRUE)
dfemo_bow$no <- as.numeric(dfemo_bow$no)
dfemo2 <- inner_join(dfemo_bow, dfemo_tk)
## Joining with `by = join_by(no)`
collapsed_dfemo2 <- dfemo2 %>%
group_by(nouns) %>%
summarize(count = sum(n, na.rm = TRUE))
collapsed_dfemo2$total <- sum(collapsed_dfemo2$count)
collapsed_dfemo2$freq <- collapsed_dfemo2$count/collapsed_dfemo2$total
freq_emo_all<-collapsed_dfemo2 %>%
filter(count>29)
freq_emo_all <- freq_emo_all[order(-freq_emo_all$count), ]
write_xlsx(freq_emo_all, "freq_emo_all.xlsx")
collapsed_dfemo2 <- dfemo2 %>%
group_by(institute, nouns) %>%
summarize(count = sum(n, na.rm = TRUE))
## `summarise()` has grouped output by 'institute'. You can override using the
## `.groups` argument.
png("emo_plot.png")
emo_plot<- ggplot(data = freq_emo_all, aes(label = nouns, size = count, color = count))+
geom_text_wordcloud_area(area_corr_power = 1, eccentricity = 1)+
scale_size_area(max_size = 20) +
scale_color_gradient(low = "#C7DDEC", high = "#1F78B4") +
theme_minimal()
print(emo_plot)
dev.off()
## png
## 2
frequency <- collapsed_dfemo2 %>%
left_join(collapsed_dfemo2 %>% group_by(institute)%>%
summarize(total=sum(count, na.rm=TRUE))) %>%
mutate(freq = count/total)
## Joining with `by = join_by(institute)`
frequency<- frequency[order(frequency$institute, -frequency$freq), ]
frequency
write_xlsx(freq_emo_all, "freq_emo_by_institute.xlsx")
summary(frequency)
## institute nouns count total
## Length:220 Length:220 Min. : 2.00 Min. : 944
## Class :character Class :character 1st Qu.: 13.00 1st Qu.:3266
## Mode :character Mode :character Median : 24.00 Median :3266
## Mean : 34.32 Mean :2725
## 3rd Qu.: 38.00 3rd Qu.:3340
## Max. :343.00 Max. :3340
## freq
## Min. :0.0005988
## 1st Qu.:0.0058175
## Median :0.0116350
## Mean :0.0136364
## 3rd Qu.:0.0148305
## Max. :0.1050214
summary(frequency$count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 13.00 24.00 34.32 38.00 343.00
library(ggplot2)
ggplot(frequency, aes(freq, fill = institute)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.20) +
facet_wrap(~institute, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
frequency1_tf_idf <- frequency %>%
bind_tf_idf(nouns, institute, count)
frequency1_tf_idf
frequency1_tf_idf <-frequency1_tf_idf %>%
select(-total) %>%
arrange(institute, desc(tf_idf))
write_xlsx(frequency1_tf_idf, "freq_tf_idf_emo_by_institute.xlsx")
library(forcats)
frequency1_tf_idf %>%
group_by(institute) %>%
slice_max(tf_idf, n = 7) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(nouns, tf_idf), fill = institute)) +
geom_col(show.legend = FALSE) +
facet_wrap(~institute, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = NULL)
library(tidyr)
table(frequency$institute)
##
## 가정내 시설 재가급여
## 87 54 79
frequency <- frequency %>%
select(institute, nouns, freq) %>%
pivot_wider(names_from = institute, values_from = freq)
frequency
word_ratios <- collapsed_dfemo2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(시설/가정내)) %>%
arrange(desc(logratio))
word_ratios %>%
arrange(abs(logratio))
word_ratios %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (요양시설/가정내)") +
scale_fill_discrete(name = "", labels = c("요양시설", "가정내"))
word_ratios2 <- collapsed_dfemo2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(시설/재가급여)) %>%
arrange(desc(logratio))
word_ratios2 %>%
arrange(abs(logratio))
word_ratios2 %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (요양시설/재가급여)") +
scale_fill_discrete(name = "", labels = c("요양시설", "재가급여"))
word_ratios3 <- collapsed_dfemo2 %>%
filter(count >= 10) %>%
pivot_wider(names_from = institute, values_from = count, values_fill = 0) %>%
mutate_if(is.numeric, list(~(. + 1) / (sum(.) + 1))) %>%
mutate(logratio = log(재가급여/시설)) %>%
arrange(desc(logratio))
word_ratios3 %>%
arrange(abs(logratio))
word_ratios3 %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 5) %>%
ungroup() %>%
mutate(nouns = reorder(nouns, logratio)) %>%
ggplot(aes(nouns, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() +
ylab("log odds ratio (재가급여/요양시설)") +
scale_fill_discrete(name = "", labels = c("재가급여", "요양시설"))
## DTM 변환
dfemo_dtm <- dfemo_bow %>%
cast_dtm(document = no, term = nouns, value = n) %>%
as.matrix
## LDA 모형 적합
dfemo_lda <- LDA(dfemo_dtm, k = 3, method="Gibbs",
control=list(alpha=0.1, delta=0.1, seed=1357))
dfemo_topics <- tidy(dfemo_lda, matrix = "beta")
dfemo_topics
dfemo_top_terms <- dfemo_topics %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
arrange(topic, -beta)
dfemo_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()
## 토픽 결과 - 행렬
tidy(dfemo_lda, matrix="gamma") %>%
arrange(document) %>%
spread(topic, gamma)
library(tidyr)
beta_wide <- dfemo_topics %>%
mutate(topic = paste0("topic", topic)) %>%
pivot_wider(names_from = topic, values_from = beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_wide
dfemo_gamma <- tidy(dfemo_lda, matrix="gamma")
dfemo_gamma$no=as.numeric(dfemo_gamma$document)
dfemo_gamma <- left_join(dfemo_gamma, dfemo_tk)
## Joining with `by = join_by(no)`
dfemo_gamma %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ institute) +
labs(x = "topic", y = expression(gamma))
dfemo_gamma$emotional= as.factor(dfemo_gamma$emotional)
dfemo_gamma %>%
mutate(emotional = reorder(emotional, gamma * topic)) %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ emotional) +
labs(x = "topic", y = expression(gamma))
tidy(dfemo_lda, matrix="gamma") %>%
filter((topic==1 & gamma > .9) | (topic==2 & gamma > .9) |(topic==3 & gamma > .9))
dfemo_topic_g <- tidy(dfemo_lda, matrix="gamma") %>%
filter((topic==1 & gamma > .9) | (topic==2 & gamma > .9) |(topic==3 & gamma > .8)) %>%
mutate(topic = as.factor(topic)) %>%
ggplot(aes(x = document, y=gamma)) +
geom_col(aes(fill = topic), position=position_dodge()) +
scale_fill_manual(values = c("orange", "midnightblue", "blue")) +
theme_light() +
labs(title="정서적부담 토픽모형",
subtitle = "문서 토픽 행렬")
dfemo_topic_g