Difference between revisions of "R: tidytext NASA data"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) (Created page with " library(jsonlite) metadata <- fromJSON("https://data.nasa.gov/data.json") names(metadata$dataset) class(metadata$dataset$title) class(metadata$dataset$description) cla...") |
Onnowpurbo (talk | contribs) |
||
| (7 intermediate revisions by the same user not shown) | |||
| Line 3: | Line 3: | ||
metadata <- fromJSON("https://data.nasa.gov/data.json") | metadata <- fromJSON("https://data.nasa.gov/data.json") | ||
names(metadata$dataset) | names(metadata$dataset) | ||
| + | save(metadata,file="metadata.Rdata") | ||
| + | |||
class(metadata$dataset$title) | class(metadata$dataset$title) | ||
| Line 8: | Line 10: | ||
class(metadata$dataset$keyword) | class(metadata$dataset$keyword) | ||
| + | # ambil judul2 | ||
library(dplyr) | library(dplyr) | ||
nasa_title <- data_frame(id = metadata$dataset$`_id`$`$oid`, | nasa_title <- data_frame(id = metadata$dataset$`_id`$`$oid`, | ||
title = metadata$dataset$title) | title = metadata$dataset$title) | ||
nasa_title | nasa_title | ||
| + | save(nasa_title,file="nasa_title.Rdata") | ||
| + | |||
| + | |||
| + | |||
| + | # sampling 5 line | ||
| + | nasa_desc <- data_frame(id = metadata$dataset$`_id`$`$oid`, | ||
| + | desc = metadata$dataset$description) | ||
| + | nasa_desc %>% | ||
| + | select(desc) %>% | ||
| + | sample_n(5) | ||
| + | |||
| + | |||
| + | # cari keyword | ||
| + | library(tidyr) | ||
| + | nasa_keyword <- data_frame(id = metadata$dataset$`_id`$`$oid`, | ||
| + | keyword = metadata$dataset$keyword) %>% | ||
| + | unnest(keyword) | ||
| + | nasa_keyword | ||
| + | save(nasa_desc,file="nasa_desc.Rdata") | ||
| + | save(nasa_keyword,file="nasa_keyword.Rdata") | ||
| + | |||
| + | |||
| + | library(tidytext) | ||
| + | nasa_title <- nasa_title %>% | ||
| + | unnest_tokens(word, title) %>% | ||
| + | anti_join(stop_words) | ||
| + | nasa_desc <- nasa_desc %>% | ||
| + | unnest_tokens(word, desc) %>% | ||
| + | anti_join(stop_words) | ||
| + | nasa_title | ||
| + | nasa_desc | ||
| + | |||
| + | |||
| + | # count & sort | ||
| + | nasa_title %>% | ||
| + | count(word, sort = TRUE) | ||
| + | nasa_desc %>% | ||
| + | count(word, sort = TRUE) | ||
| + | |||
| + | |||
| + | # buang keyword yang tidak ada artinya | ||
| + | my_stopwords <- data_frame(word = c(as.character(1:10), | ||
| + | "v1", "v03", "l2", "l3", "l4", "v5.2.0", | ||
| + | "v003", "v004", "v005", "v006", "v7")) | ||
| + | nasa_title <- nasa_title %>% | ||
| + | anti_join(my_stopwords) | ||
| + | nasa_desc <- nasa_desc %>% | ||
| + | anti_join(my_stopwords) | ||
| + | |||
| + | |||
| + | # supaya OCEAN & Ocean jadi sama | ||
| + | nasa_keyword <- nasa_keyword %>% | ||
| + | mutate(keyword = toupper(keyword)) | ||
| + | # | ||
| + | library(widyr) | ||
| + | title_word_pairs <- nasa_title %>% | ||
| + | pairwise_count(word, id, sort = TRUE, upper = FALSE) | ||
| + | title_word_pairs | ||
| + | # ini bisa gagal karena terlalu besar | ||
| + | desc_word_pairs <- nasa_desc %>% | ||
| + | pairwise_count(word, id, sort = TRUE, upper = FALSE) | ||
| + | desc_word_pairs | ||
| + | library(ggplot2) | ||
| + | library(igraph) | ||
| + | library(ggraph) | ||
| + | set.seed(1234) | ||
| + | title_word_pairs %>% | ||
| + | filter(n >= 250) %>% | ||
| + | graph_from_data_frame() %>% | ||
| + | ggraph(layout = "fr") + | ||
| + | geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") + | ||
| + | geom_node_point(size = 5) + | ||
| + | geom_node_text(aes(label = name), repel = TRUE, | ||
| + | point.padding = unit(0.2, "lines")) + | ||
| + | theme_void() | ||
Latest revision as of 12:25, 8 November 2018
# Download the NASA open-data catalogue, cache it locally, and inspect the
# fields used in the rest of the script.
library(jsonlite)

metadata <- fromJSON("https://data.nasa.gov/data.json")
names(metadata$dataset)

# Keep a local copy so the download does not have to be repeated.
save(metadata, file = "metadata.Rdata")

# Each call must be its own statement: several expressions on one line
# without a `;` separator do not parse in R.
class(metadata$dataset$title)
class(metadata$dataset$description)
class(metadata$dataset$keyword)
# Extract the titles: one row per dataset, keyed by its id.
library(dplyr)

# `tibble()` replaces the deprecated `data_frame()`; dplyr re-exports it.
# NOTE(review): assumes every dataset entry exposes `_id$$oid` -- confirm
# against the current layout of the feed.
nasa_title <- tibble(
  id = metadata$dataset$`_id`$`$oid`,
  title = metadata$dataset$title
)
nasa_title

save(nasa_title, file = "nasa_title.Rdata")
# Build the id/description table and look at a sample of 5 descriptions.
# `tibble()` replaces the deprecated `data_frame()`; dplyr re-exports it.
nasa_desc <- tibble(
  id = metadata$dataset$`_id`$`$oid`,
  desc = metadata$dataset$description
)

# No seed is set here, so the five rows shown differ between runs.
nasa_desc %>%
  select(desc) %>%
  sample_n(5)
# Extract the keywords: `keyword` is unnested so each dataset/keyword pair
# gets its own row.
library(tidyr)

# `tibble()` replaces the deprecated `data_frame()`; dplyr re-exports it.
nasa_keyword <- tibble(
  id = metadata$dataset$`_id`$`$oid`,
  keyword = metadata$dataset$keyword
) %>%
  unnest(keyword)
nasa_keyword

# Cache the untokenized description and keyword tables.
save(nasa_desc, file = "nasa_desc.Rdata")
save(nasa_keyword, file = "nasa_keyword.Rdata")
# Tokenize titles and descriptions into one word per row, then drop the
# entries listed in tidytext's `stop_words` table.
library(tidytext)

# The join key is spelled out so the match cannot silently widen to other
# shared column names, and so dplyr does not emit its "Joining, by" message.
nasa_title <- nasa_title %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word")

nasa_desc <- nasa_desc %>%
  unnest_tokens(word, desc) %>%
  anti_join(stop_words, by = "word")

nasa_title
nasa_desc
# Word frequencies for titles and descriptions, most frequent first.
count(nasa_title, word, sort = TRUE)
count(nasa_desc, word, sort = TRUE)
# Remove tokens that carry no meaning: bare numbers 1-10 and version or
# level tags.
# `tibble()` replaces the deprecated `data_frame()`; dplyr re-exports it.
my_stopwords <- tibble(
  word = c(
    as.character(1:10),
    "v1", "v03", "l2", "l3", "l4", "v5.2.0",
    "v003", "v004", "v005", "v006", "v7"
  )
)

# Explicit join key: `word` is the only column of `my_stopwords`.
nasa_title <- nasa_title %>%
  anti_join(my_stopwords, by = "word")
nasa_desc <- nasa_desc %>%
  anti_join(my_stopwords, by = "word")
# Upper-case every keyword so that variants such as "OCEAN" and "Ocean"
# are treated as the same keyword.
nasa_keyword <- mutate(nasa_keyword, keyword = toupper(keyword))
#
library(widyr)

# Count word pairs that occur together under the same dataset id (titles).
title_word_pairs <- pairwise_count(
  nasa_title, word, id,
  sort = TRUE, upper = FALSE
)
title_word_pairs

# Same for the description words.
# NOTE: this step can fail because the input is too large.
desc_word_pairs <- pairwise_count(
  nasa_desc, word, id,
  sort = TRUE, upper = FALSE
)
desc_word_pairs
library(ggplot2)
library(igraph)
library(ggraph)

# Fix the RNG seed before the layout is computed.
set.seed(1234)

# Keep only title word pairs counted at least 250 times and turn them
# into a graph.
title_graph <- title_word_pairs %>%
  filter(n >= 250) %>%
  graph_from_data_frame()

# Draw the network: edge opacity and width follow the pair count `n`.
ggraph(title_graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()