Scraping: Simpan teks hasil setiap pencarian
Jump to navigation
Jump to search
Simpan konten per URL ke file `.txt`, 1 file per halaman Jadi tiap hasil pencarian disimpan jadi file `.txt` sendiri biar bisa dibaca atau dianalisis dengan mudah.
FITUR-FITUR:
- Baca `keywords.txt` → cari di Google
- Ambil `top-N` hasil pencarian tiap keyword
- Kunjungi tiap URL, ambil judul + isi artikel (5 paragraf pertama)
- Simpan ke folder `outputs/` dalam format `.txt`:
- Nama file: `keyword_rank_judul.txt`
PERSIAPAN:
pip install googlesearch-python requests beautifulsoup4
Buat file `keywords.txt`:
berita teknologi Indonesia
politik 2025
game PS5 terbaru
SCRIPT PYTHON SUPER LENGKAP:
import os
import requests
from googlesearch import search
from bs4 import BeautifulSoup
import re
import time
def load_keywords(filename):
    """Read *filename* and return its non-blank lines, whitespace-stripped."""
    with open(filename, 'r', encoding='utf-8') as handle:
        stripped = (raw.strip() for raw in handle)
        return [line for line in stripped if line]
def clean_filename(text):
    """Sanitize *text* for use as a filename.

    Strips characters that are illegal in Windows/Unix filenames, trims
    surrounding whitespace, replaces internal spaces with underscores,
    and truncates the result to 50 characters.
    """
    # Bug fix: the original call was missing the replacement argument
    # (`re.sub(pattern, , text)` — a syntax error). Illegal characters
    # are removed by substituting the empty string.
    return re.sub(r'[\\/*?:"<>|]', '', text).strip().replace(' ', '_')[:50]
def get_page_content(url):
    """Fetch *url* and extract its title plus the first five paragraphs.

    Returns a ``(title, content)`` tuple. On any failure (network error,
    HTTP error status, parse problem) returns ``('Error', <message>)``
    instead of raising, so callers can keep iterating over URLs.
    """
    try:
        headers = {
            # Plain browser UA so naive bot filters serve the real page.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        }
        response = requests.get(url, headers=headers, timeout=10)
        # Bug fix: treat HTTP error statuses (404/500/...) as failures
        # instead of silently scraping the error page's markup.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Bug fix: <title> may exist with a None .string (e.g. nested
        # markup inside it); the original then crashed on title.strip()
        # and misreported the whole fetch as an error.
        title = soup.title.string if soup.title and soup.title.string else 'No Title'
        paragraphs = soup.find_all('p')
        content = '\n\n'.join([p.get_text() for p in paragraphs[:5]])
        return title.strip(), content.strip()
    except Exception as e:
        return 'Error', f"Gagal mengambil konten: {e}"
def save_to_txt(keyword, rank, title, url, content, folder='outputs'):
    """Write one search result to ``<folder>/<keyword>_<rank>_<title>.txt``.

    Creates *folder* on demand; the file holds a small metadata header
    followed by the extracted article text.
    """
    os.makedirs(folder, exist_ok=True)
    name = f"{clean_filename(keyword)}_{rank}_{clean_filename(title)}.txt"
    target = os.path.join(folder, name)
    # Assemble the metadata header once, then emit it with the body.
    header = (
        f"Keyword : {keyword}\n"
        f"Peringkat : {rank}\n"
        f"Judul : {title}\n"
        f"URL : {url}\n\n"
    )
    with open(target, 'w', encoding='utf-8') as out:
        out.write(header)
        out.write(content)
def scrape_and_save_txt(keywords, num_results=5):
    """Search Google for each keyword and archive the top results as .txt files.

    For every keyword, fetches up to *num_results* URLs, extracts each
    page's title and content, and saves them via ``save_to_txt``.
    Failures for one keyword are reported and do not stop the others.
    """
    for keyword in keywords:
        print(f"\n🔍 Searching: {keyword}")
        try:
            for rank, url in enumerate(search(keyword, num_results=num_results), start=1):
                print(f" → ({rank}) Fetching: {url}")
                title, content = get_page_content(url)
                save_to_txt(keyword, rank, title, url, content)
                # Pause between fetches to avoid hammering the sites.
                time.sleep(2)
        except Exception as e:
            print(f"❌ Error saat mencari '{keyword}': {e}")
    print("\n✅ Semua konten telah disimpan di folder 'outputs/'")
# Main entry point: load the keyword list and run the scraper.
if __name__ == '__main__':
    scrape_and_save_txt(load_keywords('keywords.txt'), num_results=5)
Output:
Folder `outputs/` akan berisi file seperti:
berita_teknologi_Indonesia_1_Tekno_Terbaru_dari_Tempo.txt politik_2025_2_Media_Indonesia_Politik.txt
Isi filenya:
Keyword : berita teknologi Indonesia Peringkat : 1 Judul : Berita Teknologi Terbaru Hari Ini - Tempo URL : https://tekno.tempo.co/... [ISI PARAGRAF]