Python: search-google.py

From OnnoWiki
Jump to navigation Jump to search
#!/usr/bin/env python2
# -*- coding: utf8 -*-

import sys
import time
import random
import argparse

from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.common.exceptions import NoSuchFrameException
from selenium.webdriver.common.keys import Keys

# If this script no longer fetches any results, check the XPath used in scrape_results().

def parse_args(argv=None):
    """Parse the command-line options for the scraper.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``search`` (the search term, or ``None``
        when ``-s`` is omitted) and ``pages`` (number of result pages to
        scrape, as an int; defaults to 1).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--search', help='Enter the search term')
    # type=int makes argparse reject a non-numeric page count with a usage
    # error instead of a ValueError later when the value is converted.
    parser.add_argument('-p', '--pages', type=int, default=1,
                        help='Enter how many pages to scrape (1 page = 100 results)')
    return parser.parse_args(argv)

def start_browser():
    """Launch a Firefox WebDriver session and return the driver.

    The driver's implicit wait is set to 10 before it is handed back.
    """
    browser = webdriver.Firefox()
    browser.implicitly_wait(10)
    return browser

def get_ua():
    """Return one browser user-agent string chosen at random from a fixed pool."""
    candidates = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0',
    )
    return random.choice(candidates)

def scrape_results(br):
    """Collect (title, url) pairs for the result links on the current page.

    Args:
        br: Browser object; its ``find_elements_by_xpath`` method is used to
            locate the anchors inside ``<h3 class="r">`` headings.

    Returns:
        A list of ``(title, url)`` tuples in the order the links were found,
        where ``title`` is the link text encoded as UTF-8 and ``url`` is the
        link's ``href`` attribute.
    """
    anchors = br.find_elements_by_xpath("//h3[@class='r']/a[@href]")
    return [(anchor.text.encode('utf8'), anchor.get_attribute('href'))
            for anchor in anchors]

def go_to_page(br, page_num, search_term, delay=2):
    """Load one page of Google results for a search term in the browser.

    Args:
        br: Browser object; only its ``get(url)`` method is used.
        page_num: 1-based results page to load. Page N starts at result
            offset ``(N - 1) * 100``.
        search_term: Raw search phrase; it is URL-encoded before being
            placed in the address.
        delay: Seconds to sleep after requesting the page (default 2).
    """
    # Function-scope import that resolves on both Python 2 and Python 3.
    try:
        from urllib import quote_plus
    except ImportError:
        from urllib.parse import quote_plus

    offset = (page_num - 1) * 100
    # Encode the term so spaces, '&' and '#' in it cannot corrupt the
    # other parameters of the address.
    url = ('https://www.google.com/webhp?#num=100&start=' + str(offset) +
           '&q=' + quote_plus(search_term))
    print('[*] Fetching 100 results from page ' + str(page_num) + ' at ' + url)
    br.get(url)
    # NOTE(review): presumably gives the page time to render its results
    # before the caller scrapes it -- confirm the needed duration.
    time.sleep(delay)

def main():
    args = parse_args()
    br = start_browser()
    if not args.search:
        sys.exit("[!] Enter a term or phrase to search with the -s option: -s 'dan mcinerney'")
    search_term = args.search
    pages = args.pages 

    all_results = []
    for page_num in xrange(int(pages)):
        page_num = page_num+1 # since it starts at 0
        go_to_page(br, page_num, search_term)
        titles_urls = scrape_results(br)
        for title in titles_urls:
            all_results.append(title) 

    for result in all_results:
        title = result[0]
        url = result[1]
        print '[+]', title, '--', url 

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()