Difference between revisions of "R: tidytext NASA data"
		
		
		
		
		
		Jump to navigation
		Jump to search
		
				
		
		
	
Onnowpurbo (talk | contribs)  | 
				Onnowpurbo (talk | contribs)   | 
				||
| (3 intermediate revisions by the same user not shown) | |||
| Line 3: | Line 3: | ||
  metadata <- fromJSON("https://data.nasa.gov/data.json")  |   metadata <- fromJSON("https://data.nasa.gov/data.json")  | ||
  names(metadata$dataset)  |   names(metadata$dataset)  | ||
| + |  save(metadata,file="metadata.Rdata")  | ||
| + | |||
  class(metadata$dataset$title)  |   class(metadata$dataset$title)  | ||
| Line 13: | Line 15: | ||
                           title = metadata$dataset$title)  |                            title = metadata$dataset$title)  | ||
  nasa_title  |   nasa_title  | ||
| + |  save(nasa_title,file="nasa_title.Rdata")  | ||
| + | |||
| Line 29: | Line 33: | ||
                                   unnest(keyword)  |                                    unnest(keyword)  | ||
  nasa_keyword  |   nasa_keyword  | ||
| − | + |  save(nasa_desc,file="nasa_desc.Rdata")  | |
| + |  save(nasa_keyword,file="nasa_keyword.Rdata")  | ||
| Line 43: | Line 48: | ||
| − | + |  # count & sort  | |
  nasa_title %>%  |   nasa_title %>%  | ||
      count(word, sort = TRUE)  |       count(word, sort = TRUE)  | ||
| Line 50: | Line 55: | ||
| + |  # buang keyword yang tidak ada artinya  | ||
  my_stopwords <- data_frame(word = c(as.character(1:10),  |   my_stopwords <- data_frame(word = c(as.character(1:10),  | ||
                             "v1", "v03", "l2", "l3", "l4", "v5.2.0",  |                              "v1", "v03", "l2", "l3", "l4", "v5.2.0",  | ||
| Line 59: | Line 65: | ||
| + |  # supaya OCEAN & Ocean jadi sama  | ||
| + |  nasa_keyword <- nasa_keyword %>%  | ||
| + |       mutate(keyword = toupper(keyword))  | ||
| + | |||
| + | |||
| + |  #  | ||
| + |  library(widyr)  | ||
| + |  title_word_pairs <- nasa_title %>%  | ||
| + |       pairwise_count(word, id, sort = TRUE, upper = FALSE)  | ||
| + |  title_word_pairs  | ||
| + |  # ini bisa gagal karena terlalu besar  | ||
| + |  desc_word_pairs <- nasa_desc %>%  | ||
| + |      pairwise_count(word, id, sort = TRUE, upper = FALSE)  | ||
| + |  desc_word_pairs  | ||
| + |  library(ggplot2)  | ||
| + |  library(igraph)  | ||
| + |  library(ggraph)  | ||
| + |  set.seed(1234)  | ||
| + |  title_word_pairs %>%  | ||
| + |     filter(n >= 250) %>%  | ||
| + |         graph_from_data_frame() %>%  | ||
| + |             ggraph(layout = "fr") +  | ||
| + |                 geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") +  | ||
| + |                 geom_node_point(size = 5) +  | ||
| + |                 geom_node_text(aes(label = name), repel = TRUE,  | ||
| + |                 point.padding = unit(0.2, "lines")) +  | ||
| + |                 theme_void()  | ||
Latest revision as of 12:25, 8 November 2018
# Download the NASA dataset catalogue, list the fields of its `dataset`
# element and save a local copy to metadata.Rdata
library(jsonlite)
metadata <- fromJSON("https://data.nasa.gov/data.json")
names(metadata$dataset)
save(metadata, file = "metadata.Rdata")
# Inspect the classes of the three fields used below. Each call has to be its
# own statement: R cannot parse several calls placed side by side on one line.
class(metadata$dataset$title)
class(metadata$dataset$description)
class(metadata$dataset$keyword)
# Collect the dataset titles together with their ids, print them and save
# them to nasa_title.Rdata
library(dplyr)
# tibble() is used instead of the deprecated data_frame(); dplyr re-exports it
nasa_title <- tibble(id = metadata$dataset$`_id`$`$oid`,
                     title = metadata$dataset$title)
nasa_title
save(nasa_title, file = "nasa_title.Rdata")
# Collect the dataset descriptions together with their ids, then show a
# random sample of 5 descriptions
# tibble() is used instead of the deprecated data_frame(); dplyr re-exports it
nasa_desc <- tibble(id = metadata$dataset$`_id`$`$oid`,
                    desc = metadata$dataset$description)
nasa_desc %>%
  select(desc) %>%
  sample_n(5)
# Collect the keywords together with the dataset ids; unnest() expands the
# keyword column so that each keyword ends up in its own row
library(tidyr)
# tibble() is used instead of the deprecated data_frame(); dplyr re-exports it
nasa_keyword <- tibble(id = metadata$dataset$`_id`$`$oid`,
                       keyword = metadata$dataset$keyword) %>%
  unnest(keyword)
nasa_keyword
# Save the description and keyword tables for later use
save(nasa_desc, file = "nasa_desc.Rdata")
save(nasa_keyword, file = "nasa_keyword.Rdata")
# Split titles and descriptions into one word per row and remove the words
# listed in tidytext's stop_words table
library(tidytext)
# The join key is spelled out so the join does not depend on whichever column
# names the two tables happen to share
nasa_title <- nasa_title %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word")
nasa_desc <- nasa_desc %>%
  unnest_tokens(word, desc) %>%
  anti_join(stop_words, by = "word")
nasa_title
nasa_desc
# Word frequencies in titles and in descriptions, most frequent first
count(nasa_title, word, sort = TRUE)
count(nasa_desc, word, sort = TRUE)
# Discard tokens that carry no meaning: the numbers 1 to 10 and short codes
# such as "v1" or "l2"
# tibble() is used instead of the deprecated data_frame(); dplyr re-exports it
my_stopwords <- tibble(word = c(as.character(1:10),
                                "v1", "v03", "l2", "l3", "l4", "v5.2.0",
                                "v003", "v004", "v005", "v006", "v7"))
# Explicit join key: both tables are matched on their `word` column
nasa_title <- nasa_title %>%
  anti_join(my_stopwords, by = "word")
nasa_desc <- nasa_desc %>%
  anti_join(my_stopwords, by = "word")
# Upper-case every keyword so that variants such as OCEAN and Ocean become
# the same value
nasa_keyword <- mutate(nasa_keyword, keyword = toupper(keyword))
# Count word pairs within each id, sorted by count
library(widyr)
title_word_pairs <- pairwise_count(nasa_title, word, id,
                                   sort = TRUE, upper = FALSE)
title_word_pairs
# This step can fail because the description data is too large
desc_word_pairs <- pairwise_count(nasa_desc, word, id,
                                  sort = TRUE, upper = FALSE)
desc_word_pairs
# Draw the title word pairs with a count of at least 250 as a network graph
library(ggplot2)
library(igraph)
library(ggraph)

# Seed set before the graph is laid out
set.seed(1234)

# Keep only the frequent pairs and turn them into a graph object
title_pairs_frequent <- filter(title_word_pairs, n >= 250)
title_pair_graph <- graph_from_data_frame(title_pairs_frequent)

# Edge transparency and width both follow the pair count `n`
ggraph(title_pair_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n),
                 edge_colour = "cyan4") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name),
                 repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()