Difference between revisions of "R: tidytext NASA data"
Jump to navigation
Jump to search
Onnowpurbo (talk | contribs) |
Onnowpurbo (talk | contribs) |
||
(4 intermediate revisions by the same user not shown) | |||
Line 3: | Line 3: | ||
metadata <- fromJSON("https://data.nasa.gov/data.json") | metadata <- fromJSON("https://data.nasa.gov/data.json") | ||
names(metadata$dataset) | names(metadata$dataset) | ||
+ | save(metadata,file="metadata.Rdata") | ||
+ | |||
class(metadata$dataset$title) | class(metadata$dataset$title) | ||
Line 13: | Line 15: | ||
title = metadata$dataset$title) | title = metadata$dataset$title) | ||
nasa_title | nasa_title | ||
+ | save(nasa_title,file="nasa_title.Rdata") | ||
+ | |||
Line 29: | Line 33: | ||
unnest(keyword) | unnest(keyword) | ||
nasa_keyword | nasa_keyword | ||
− | + | save(nasa_desc,file="nasa_desc.Rdata") | |
+ | save(nasa_keyword,file="nasa_keyword.Rdata") | ||
Line 39: | Line 44: | ||
unnest_tokens(word, desc) %>% | unnest_tokens(word, desc) %>% | ||
anti_join(stop_words) | anti_join(stop_words) | ||
− | |||
nasa_title | nasa_title | ||
nasa_desc | nasa_desc | ||
+ | |||
+ | |||
+ | # count & sort | ||
+ | nasa_title %>% | ||
+ | count(word, sort = TRUE) | ||
+ | nasa_desc %>% | ||
+ | count(word, sort = TRUE) | ||
+ | |||
+ | |||
+ | # buang keyword yang tidak ada artinya | ||
+ | my_stopwords <- data_frame(word = c(as.character(1:10), | ||
+ | "v1", "v03", "l2", "l3", "l4", "v5.2.0", | ||
+ | "v003", "v004", "v005", "v006", "v7")) | ||
+ | nasa_title <- nasa_title %>% | ||
+ | anti_join(my_stopwords) | ||
+ | nasa_desc <- nasa_desc %>% | ||
+ | anti_join(my_stopwords) | ||
+ | |||
+ | |||
+ | # supaya OCEAN & Ocean jadi sama | ||
+ | nasa_keyword <- nasa_keyword %>% | ||
+ | mutate(keyword = toupper(keyword)) | ||
+ | |||
+ | |||
+ | # | ||
+ | library(widyr) | ||
+ | title_word_pairs <- nasa_title %>% | ||
+ | pairwise_count(word, id, sort = TRUE, upper = FALSE) | ||
+ | title_word_pairs | ||
+ | |||
+ | # ini bisa gagal karena terlalu besar | ||
+ | desc_word_pairs <- nasa_desc %>% | ||
+ | pairwise_count(word, id, sort = TRUE, upper = FALSE) | ||
+ | desc_word_pairs | ||
+ | |||
+ | |||
+ | library(ggplot2) | ||
+ | library(igraph) | ||
+ | library(ggraph) | ||
+ | set.seed(1234) | ||
+ | title_word_pairs %>% | ||
+ | filter(n >= 250) %>% | ||
+ | graph_from_data_frame() %>% | ||
+ | ggraph(layout = "fr") + | ||
+ | geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") + | ||
+ | geom_node_point(size = 5) + | ||
+ | geom_node_text(aes(label = name), repel = TRUE, | ||
+ | point.padding = unit(0.2, "lines")) + | ||
+ | theme_void() | ||
Latest revision as of 12:25, 8 November 2018
# Download the NASA data catalogue (JSON) and keep a local copy on disk.
library(jsonlite)

metadata <- fromJSON("https://data.nasa.gov/data.json")

# List the fields available for each dataset entry.
names(metadata$dataset)

# Store the parsed catalogue in the working directory.
save(metadata, file = "metadata.Rdata")
# Inspect the class of the three fields used in the rest of the script:
# title, description and keyword.
class(metadata$dataset$title)
class(metadata$dataset$description)
class(metadata$dataset$keyword)
# Extract the titles: one row per dataset, keyed by the `_id$oid` field.
library(dplyr)

# tibble() replaces data_frame(), which is deprecated; dplyr re-exports it,
# so no extra library() call is needed.
nasa_title <- tibble(
  id = metadata$dataset$`_id`$`$oid`,
  title = metadata$dataset$title
)
nasa_title

# Keep a copy of the untokenized titles on disk.
save(nasa_title, file = "nasa_title.Rdata")
# Extract the descriptions (same id key as the titles) and print a random
# sample of five of them.
# tibble() replaces the deprecated data_frame(); it is re-exported by dplyr,
# which is assumed to be attached earlier in the script.
nasa_desc <- tibble(
  id = metadata$dataset$`_id`$`$oid`,
  desc = metadata$dataset$description
)

nasa_desc %>%
  select(desc) %>%
  sample_n(5)
# Extract the keywords.
library(tidyr)

# tibble() replaces the deprecated data_frame(); it is re-exported by dplyr,
# which is assumed to be attached earlier in the script.
# unnest() expands the keyword column so each keyword gets its own row
# (presumably keyword is a list column with several entries per dataset).
nasa_keyword <- tibble(
  id = metadata$dataset$`_id`$`$oid`,
  keyword = metadata$dataset$keyword
) %>%
  unnest(keyword)
nasa_keyword

# Save the description and keyword tables to disk.
save(nasa_desc, file = "nasa_desc.Rdata")
save(nasa_keyword, file = "nasa_keyword.Rdata")
# Tokenize titles and descriptions into one word per row and drop the
# stop words listed in tidytext's `stop_words` table.
library(tidytext)

# The join key is given explicitly so the join does not depend on whichever
# column names the two tables happen to share (and emits no "Joining, by"
# message).
nasa_title <- nasa_title %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word")

nasa_desc <- nasa_desc %>%
  unnest_tokens(word, desc) %>%
  anti_join(stop_words, by = "word")

nasa_title
nasa_desc
# Word frequencies, most frequent first, for titles and for descriptions.
count(nasa_title, word, sort = TRUE)
count(nasa_desc, word, sort = TRUE)
# Remove tokens that carry no meaning: the bare numbers 1-10 and short
# tokens such as "v1", "l2" or "v5.2.0".
# tibble() replaces the deprecated data_frame(); it is re-exported by dplyr,
# which is assumed to be attached earlier in the script.
my_stopwords <- tibble(
  word = c(
    as.character(1:10),
    "v1", "v03", "l2", "l3", "l4", "v5.2.0",
    "v003", "v004", "v005", "v006", "v7"
  )
)

# Join key given explicitly instead of relying on the implicit natural join.
nasa_title <- nasa_title %>%
  anti_join(my_stopwords, by = "word")
nasa_desc <- nasa_desc %>%
  anti_join(my_stopwords, by = "word")
# Upper-case every keyword so that variants such as "OCEAN" and "Ocean"
# are treated as the same keyword.
nasa_keyword <- mutate(nasa_keyword, keyword = toupper(keyword))
# Count how often each pair of words occurs together in the same title
# (grouped by dataset id), most frequent pairs first.
library(widyr)

title_word_pairs <- pairwise_count(
  nasa_title,
  word,
  id,
  sort = TRUE,
  upper = FALSE
)
title_word_pairs
# Same pair count for the description words.
# This step may fail because the result is too large.
desc_word_pairs <- pairwise_count(
  nasa_desc,
  word,
  id,
  sort = TRUE,
  upper = FALSE
)
desc_word_pairs
# Draw the title word pairs as a network graph.
library(ggplot2)
library(igraph)
library(ggraph)

# Fix the RNG seed before plotting (presumably so the "fr" layout is
# reproducible between runs -- confirm).
set.seed(1234)

title_word_pairs %>%
  # Keep only pairs that occur together at least 250 times.
  filter(n >= 250) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  # Edge opacity and width both scale with the pair count.
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  # Label each node with its word; repel = TRUE is requested for the labels.
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()