Examples¶
Few examples of what you can do with Cabu.
Simple example¶
@app.route('/gizmodo_last_articles_links')
def gizmodo_last_articles():
app.webdriver.get('http://www.gizmodo.com')
articles_links = [i.get_attribute('href') for i in app.webdriver.find_elements_by_css_selector('h1.headline>a')]
return jsonify({
'message': 'Last articles',
'articles': articles_links,
})
Persistence¶
You can persist what you crawl to a database.
@app.route('/store_gizmodo_last_articles_links')
def store_gizmodo_last_articles():
app.webdriver.get('http://www.gizmodo.com')
articles_links = [i.get_attribute('href') for i in app.webdriver.find_elements_by_css_selector('h1.headline>a')]
app.db.links.insertMany(articles_links)
return jsonify({
'message': 'Last articles from Gizmodo',
'nb_articles_inserted': len(articles_links),
})
Export links of a page to S3¶
But also to a S3 bucket :)
@app.route('/export_gizmodo_last_articles_links')
def export_gizmodo_last_articles():
app.webdriver.get('http://www.gizmodo.com')
articles_links = [i.get_attribute('href') for i in app.webdriver.find_elements_by_css_selector('h1.headline>a')]
app.bucket.put('my_links.txt', articles_links)
return jsonify({
'message': 'Last articles from Gizmodo',
'nb_articles_inserted': len(articles_links),
})
More complex scenario¶
@app.route('/cheapest_flight')
def cheapest_flight():
app.webdriver.get('https://www.expedia.com')
homepage = app.webdriver.find_element_by_tag_name('html')
tab_flight = app.webdriver.find_element_by_id('tab-flight-tab')
tab_flight.click()
# Selecting from elements
origin_input = app.webdriver.find_element_by_id('flight-origin')
destination_input = app.webdriver.find_element_by_id('flight-destination')
departure_date_input = app.webdriver.find_element_by_id('flight-departing')
return_date_input = app.webdriver.find_element_by_id('flight-returning')
search_button = app.webdriver.find_element_by_id('search-button')
# Filling infos + validation
origin_input.send_keys('New York, NY (NYC-All Airports)')
destination_input.send_keys('Paris, France (PAR-All Airports)')
return_date_input.clear()
departure_date_input.send_keys('06/01/2016')
return_date_input.click()
return_date_input.clear()
return_date_input.send_keys('09/15/2016')
search_button.click()
WebDriverWait(app.webdriver, 60).until(staleness_of(homepage))
# Scrap the cheapest
flight = {}
try:
flight['departure_time'] = app.webdriver.find_element_by_css_selector('span.departure-time').text + 'm'
flight['arrival_time'] = app.webdriver.find_element_by_css_selector('#flightModule1 span.arrival-time').text + 'm'
flight['airline'] = app.webdriver.find_element_by_css_selector('#flightModule1 div.truncate').text
flight['price'] = app.webdriver.find_element_by_css_selector('#flightModule1 .dollars').text
except NoSuchElementException:
message = 'No results :('
else:
message = 'Results retrieved !'
return jsonify({
'message': message,
'flight': flight,
'search_url': app.webdriver.current_url
})