Difference between revisions of "R: tidytext NASA data"

From OnnoWiki
Jump to navigation Jump to search
 
(3 intermediate revisions by the same user not shown)
Line 3: Line 3:
 
  metadata <- fromJSON("https://data.nasa.gov/data.json")
 
  metadata <- fromJSON("https://data.nasa.gov/data.json")
 
  names(metadata$dataset)
 
  names(metadata$dataset)
 +
save(metadata,file="metadata.Rdata")
 +
  
 
  class(metadata$dataset$title)
 
  class(metadata$dataset$title)
Line 13: Line 15:
 
                           title = metadata$dataset$title)
 
                           title = metadata$dataset$title)
 
  nasa_title
 
  nasa_title
 +
save(nasa_title,file="nasa_title.Rdata")
 +
  
  
Line 29: Line 33:
 
                                   unnest(keyword)
 
                                   unnest(keyword)
 
  nasa_keyword
 
  nasa_keyword
 
+
save(nasa_desc,file="nasa_desc.Rdata")
 +
save(nasa_keyword,file="nasa_keyword.Rdata")
  
  
Line 43: Line 48:
  
  
 
+
# count & sort
 
  nasa_title %>%
 
  nasa_title %>%
 
     count(word, sort = TRUE)
 
     count(word, sort = TRUE)
Line 50: Line 55:
  
  
 +
# buang keyword yang tidak ada artinya
 
  my_stopwords <- data_frame(word = c(as.character(1:10),
 
  my_stopwords <- data_frame(word = c(as.character(1:10),
 
                             "v1", "v03", "l2", "l3", "l4", "v5.2.0",
 
                             "v1", "v03", "l2", "l3", "l4", "v5.2.0",
Line 59: Line 65:
  
  
 +
# supaya OCEAN & Ocean jadi sama
 +
nasa_keyword <- nasa_keyword %>%
 +
      mutate(keyword = toupper(keyword))
 +
 +
 +
#
 +
library(widyr)
 +
title_word_pairs <- nasa_title %>%
 +
      pairwise_count(word, id, sort = TRUE, upper = FALSE)
 +
title_word_pairs
  
 +
# ini bisa gagal karena terlalu besar
 +
desc_word_pairs <- nasa_desc %>%
 +
    pairwise_count(word, id, sort = TRUE, upper = FALSE)
 +
desc_word_pairs
  
  
 +
library(ggplot2)
 +
library(igraph)
 +
library(ggraph)
 +
set.seed(1234)
 +
title_word_pairs %>%
 +
    filter(n >= 250) %>%
 +
        graph_from_data_frame() %>%
 +
            ggraph(layout = "fr") +
 +
                geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "cyan4") +
 +
                geom_node_point(size = 5) +
 +
                geom_node_text(aes(label = name), repel = TRUE,
 +
                point.padding = unit(0.2, "lines")) +
 +
                theme_void()
  
  

Latest revision as of 12:25, 8 November 2018

# Download https://data.nasa.gov/data.json and parse the JSON into an R object
library(jsonlite)
metadata <- fromJSON(txt = "https://data.nasa.gov/data.json")
# List the fields available under the `dataset` element
names(metadata$dataset)
# Write the parsed object to metadata.Rdata
save(metadata, file = "metadata.Rdata")


# Check the R class of the title, description and keyword fields
class(metadata$dataset$title)
class(metadata$dataset$description)
class(metadata$dataset$keyword)
# Collect the dataset titles
library(dplyr)
# One row per dataset: its id and its title.
# tibble() is used because data_frame() is deprecated in tibble.
# NOTE(review): assumes every record carries `_id`$`$oid`; confirm against the
# layout of the downloaded data.json.
nasa_title <- tibble(
  id = metadata$dataset$`_id`$`$oid`,
  title = metadata$dataset$title
)
nasa_title
# Write the title table to nasa_title.Rdata
save(nasa_title, file = "nasa_title.Rdata")


# Pair each dataset id with its description.
# tibble() is used because data_frame() is deprecated in tibble.
# NOTE(review): assumes every record carries `_id`$`$oid`; confirm against the
# layout of the downloaded data.json.
nasa_desc <- tibble(
  id = metadata$dataset$`_id`$`$oid`,
  desc = metadata$dataset$description
)
# Show five randomly sampled descriptions
nasa_desc %>%
  select(desc) %>%
  sample_n(5)


# Extract the keywords
library(tidyr)
# `keyword` is unnested so that each row holds a single id/keyword pair.
# tibble() is used because data_frame() is deprecated in tibble.
# NOTE(review): assumes every record carries `_id`$`$oid`; confirm against the
# layout of the downloaded data.json.
nasa_keyword <- tibble(
  id = metadata$dataset$`_id`$`$oid`,
  keyword = metadata$dataset$keyword
) %>%
  unnest(keyword)
nasa_keyword
# Write the description and keyword tables to disk
save(nasa_desc, file = "nasa_desc.Rdata")
save(nasa_keyword, file = "nasa_keyword.Rdata")


library(tidytext)
# Split titles and descriptions into one token per row (column `word`), then
# drop every token listed in `stop_words`.
# The join key is spelled out so the join does not depend on whichever columns
# the two tables happen to share, and so no "Joining, by" message is printed.
nasa_title <- nasa_title %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word")
nasa_desc <- nasa_desc %>%
  unnest_tokens(word, desc) %>%
  anti_join(stop_words, by = "word")
nasa_title
nasa_desc


# Word frequencies, most frequent first: titles, then descriptions
count(nasa_title, word, sort = TRUE)
count(nasa_desc, word, sort = TRUE)


# Remove tokens that carry no meaning (the numbers 1-10 and version/level tags)
# tibble() is used because data_frame() is deprecated in tibble.
my_stopwords <- tibble(
  word = c(
    as.character(1:10),
    "v1", "v03", "l2", "l3", "l4", "v5.2.0",
    "v003", "v004", "v005", "v006", "v7"
  )
)
# The join key is explicit so the join does not rely on implicit column matching
nasa_title <- nasa_title %>%
  anti_join(my_stopwords, by = "word")
nasa_desc <- nasa_desc %>%
  anti_join(my_stopwords, by = "word")


# Upper-case every keyword so that variants such as OCEAN and Ocean are the same
nasa_keyword <- mutate(nasa_keyword, keyword = toupper(keyword))


# Count pairs of words that share the same id
library(widyr)
title_word_pairs <- pairwise_count(
  nasa_title, word, id,
  sort = TRUE, upper = FALSE
)
title_word_pairs
# This step may fail because the data is too large
desc_word_pairs <- pairwise_count(
  nasa_desc, word, id,
  sort = TRUE, upper = FALSE
)
desc_word_pairs


# Draw the title word pairs as a network
library(ggplot2)
library(igraph)
library(ggraph)
# Fix the random seed before the layout is computed
set.seed(1234)
# Keep only pairs with n >= 250, turn them into a graph and plot it with the
# "fr" layout; edge opacity and width both follow the pair count n.
ggraph(
  graph_from_data_frame(filter(title_word_pairs, n >= 250)),
  layout = "fr"
) +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "cyan4"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()


Pranala Menarik