Difference between revisions of "Scrapping: Word Cloud, Word Frekuensi"
		
		
		
		
		
		Jump to navigation
		Jump to search
		
				
		
		
	
| Onnowpurbo (talk | contribs)  (Created page with "==✅ Fitur Analisis:== 1. '''Frekuensi kata''' (word count) 2. '''Word cloud''' visual 3. '''Top N kata paling sering muncul''' 4. Opsional: '''hapus stopwords Bahasa Indones...") | Onnowpurbo (talk | contribs)  | ||
| Line 1: | Line 1: | ||
| ==✅ Fitur Analisis:== | ==✅ Fitur Analisis:== | ||
| − | + | # '''Frekuensi kata''' (word count) | |
| − | + | # '''Word cloud''' visual | |
| − | + | # '''Top N kata paling sering muncul''' | |
| − | + | # Opsional: '''hapus stopwords Bahasa Indonesia''' biar hasilnya lebih bersih | |
| ==Install dulu lib tambahan:== | ==Install dulu lib tambahan:== | ||
Latest revision as of 06:34, 29 March 2025
✅ Fitur Analisis:
- Frekuensi kata (word count)
- Word cloud visual
- Top N kata paling sering muncul
- Opsional: hapus stopwords Bahasa Indonesia biar hasilnya lebih bersih
Install dulu lib tambahan:
pip install nltk matplotlib wordcloud
Lalu jalankan ini sekali untuk download stopwords-nya:
# One-time setup: fetch the NLTK 'stopwords' corpus that the analysis script
# below reads via nltk.corpus.stopwords.
import nltk
nltk.download('stopwords')
Tambahan Script Analisis (lanjut dari script sebelumnya):
import os
import string
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
# Indonesian stopword list from the NLTK corpus, held as a set so the
# per-token membership test in analisis_folder is a hash lookup.
# Requires the 'stopwords' corpus to have been downloaded beforehand.
stop_words = {*stopwords.words('indonesian')}
def bersihkan_teks(teks):
    """Return *teks* lowercased with all ASCII punctuation removed.

    Punctuation characters (``string.punctuation``) are deleted, not replaced
    by spaces, so a hyphenated token such as ``"anak-anak"`` becomes
    ``"anakanak"``.

    Args:
        teks: The raw text to normalise.

    Returns:
        The normalised text.
    """
    # The first two arguments must be empty strings (no char-to-char mapping);
    # the third lists the characters to delete.
    tabel_hapus = str.maketrans('', '', string.punctuation)
    return teks.lower().translate(tabel_hapus)
def analisis_folder(folder='outputs', top_n=20):
    """Print the most frequent words in a folder of .txt files and draw a word cloud.

    Every ``*.txt`` file directly inside *folder* is read as UTF-8. If a file
    contains a blank line, only the text after the first blank line is used
    (presumably the part before it is a header written by the earlier scraping
    script -- confirm against that script). The combined text is cleaned with
    ``bersihkan_teks``, split on whitespace, and filtered to purely alphabetic
    tokens that are not in ``stop_words``.

    The top *top_n* words are printed, and a word cloud built from all word
    frequencies is saved to ``wordcloud.png`` in the current working directory
    and shown with matplotlib.

    Args:
        folder: Directory containing the ``.txt`` files to analyse.
        top_n: Number of most frequent words to print.

    Returns:
        None.

    Raises:
        FileNotFoundError: If *folder* does not exist (from ``os.listdir``).
    """
    potongan = []
    # os.listdir() order is arbitrary; sorting makes the tie order of equally
    # frequent words reproducible between runs.
    for filename in sorted(os.listdir(folder)):
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(folder, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            isi = f.read()
        # Keep only the body: everything after the first blank line.
        if '\n\n' in isi:
            isi = isi.split('\n\n', 1)[1]
        potongan.append(isi)
    semua_konten = ' '.join(potongan)

    # Clean the text and tokenise on whitespace.
    teks_bersih = bersihkan_teks(semua_konten)
    kata_kata = teks_bersih.split()

    # Drop stopwords and any token that is not purely alphabetic.
    kata_kata = [
        kata for kata in kata_kata
        if kata not in stop_words and kata.isalpha()
    ]

    # Count frequencies.
    counter = Counter(kata_kata)

    # WordCloud raises ValueError when given no words, so stop early with a
    # clear message instead of a traceback.
    if not counter:
        print(f"\n⚠️ Tidak ada kata untuk dianalisis di folder '{folder}'.")
        return

    top_kata = counter.most_common(top_n)
    print(f"\n🔍 Top {top_n} Kata Paling Sering Muncul:")
    for kata, jumlah in top_kata:
        print(f"{kata}: {jumlah}")

    # Build the word cloud from the full frequency table.
    wordcloud = WordCloud(
        width=800, height=400, background_color='white'
    ).generate_from_frequencies(counter)

    # Render, save, then display. savefig must run before show(), because
    # show() may leave the figure empty once the window is closed.
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud dari Semua Konten')
    plt.tight_layout()
    plt.savefig('wordcloud.png')
    plt.show()
    print("\n✅ Word Cloud disimpan sebagai 'wordcloud.png'")
# Script entry point: analyse the 'outputs' folder and report the top 30 words.
if __name__ == '__main__':
    analisis_folder(folder='outputs', top_n=30)
Output:
- Console akan menampilkan 30 kata paling sering muncul
- Word cloud disimpan sebagai: `wordcloud.png`
Hasil bisa dipakai untuk:
- Lihat topik dominan
- Tambahan insight untuk laporan/jurnal
- Bahan untuk visualisasi di presentasi