Source code for cabu.utils.link_extractor

# -*- coding: utf-8 -*-

import re

try:
    from bs4 import BeautifulSoup, SoupStrainer
except Exception:  # pragma: no cover
    from BeautifulSoup import BeautifulSoup, SoupStrainer

from cabu.exceptions import LinkExtractorException


def _any_of_pattern(fragments):
    """Build one regex that matches any of *fragments*, dots made literal."""
    # Only '.' is escaped, exactly as each entry was treated before; the
    # non-capturing group keeps a '|' inside one entry from leaking into the
    # alternation between entries.
    return '|'.join(
        '(?:{0})'.format(fragment.replace('.', '\\.'))
        for fragment in fragments
    )


def extract_links(response_content, unique=False, blacklist_domains=None,
                  whitelist_domains=None, regex=None, zen_path=None,
                  blacklist_extensions=None, whitelist_extensions=None):
    """Extract links from a response content.

    Every ``<a>`` element of the page is collected, then the optional filters
    are applied in this order: ``unique``, ``regex``, domain whitelist,
    domain blacklist, extension whitelist, extension blacklist.

    Args:
        response_content (str): The HTML page received in a Response Object.
        unique (bool): When True, duplicates are removed while keeping the
                       first occurrence of each link in document order.
                       Defaults to False.
        blacklist_domains (list): Domains to exclude from the result. A link
                                  is dropped if it matches any entry.
        whitelist_domains (list): Domains to keep in the result. A link is
                                  kept if it matches at least one entry.
        regex (str): A regular expression filter on the link; only matching
                     links are kept. Defaults to None.
        zen_path: Currently unused by this function; accepted for interface
                  compatibility.
        blacklist_extensions (list): Extensions to exclude from the result.
                                     A link is dropped if it matches any
                                     entry.
        whitelist_extensions (list): Extensions to keep in the result. A link
                                     is kept if it matches at least one
                                     entry.

    Domain and extension entries are searched anywhere in the link; their
    dots are matched literally, other regex metacharacters stay active.

    Returns:
        links (list): A list of extracted and filtered links.

    Raises:
        LinkExtractorException: If a value appears in both the whitelist and
            the blacklist for domains, or for extensions.
    """
    # None defaults avoid sharing one mutable list object between calls.
    blacklist_domains = blacklist_domains or []
    whitelist_domains = whitelist_domains or []
    blacklist_extensions = blacklist_extensions or []
    whitelist_extensions = whitelist_extensions or []

    if any(item in blacklist_domains for item in whitelist_domains):
        raise LinkExtractorException('blacklist_domains and whitelist_domains '
                                     'can`t contain common value(s).')
    if any(item in blacklist_extensions for item in whitelist_extensions):
        raise LinkExtractorException('blacklist_extensions and '
                                     'whitelist_extensions can`t contain '
                                     'common value(s).')

    soup = BeautifulSoup(
        response_content, "html.parser", parse_only=SoupStrainer('a')
    )
    # NOTE(review): this collects each anchor's text, not its ``href``
    # attribute -- confirm that this is the intended notion of "link".
    links = [a.text for a in soup]

    if unique:
        # Order-preserving de-duplication (a plain set() would shuffle links).
        seen = set()
        deduplicated = []
        for link in links:
            if link not in seen:
                seen.add(link)
                deduplicated.append(link)
        links = deduplicated

    if regex:
        links = filter_links(links, regex)

    # Each list is folded into ONE alternation. Applying whitelist entries
    # one after another would require a link to match all of them at once,
    # which empties the result as soon as two different domains are allowed.
    if whitelist_domains:
        links = filter_links(
            links, _any_of_pattern(whitelist_domains), include=True
        )

    if blacklist_domains:
        links = filter_links(
            links, _any_of_pattern(blacklist_domains), include=False
        )

    if whitelist_extensions:
        links = filter_links(
            links, _any_of_pattern(whitelist_extensions), include=True
        )

    if blacklist_extensions:
        links = filter_links(
            links, _any_of_pattern(blacklist_extensions), include=False
        )

    return links


def filter_links(links, regex, include=True):
    """Filter a list of links based on the given params.

    Args:
        links (list): The links (strings) to filter. The input list is not
                      modified.
        regex (str): A regular expression searched anywhere in every link.
        include (bool): When true, keep only the links that match ``regex``
                        (whitelist); when false, keep only the links that do
                        not match (blacklist). Defaults to True.

    Returns:
        links_filtered (list): The list filtered according to the arguments,
            in the same order as ``links``.
    """
    keep_matching = bool(include)
    # One search per link: a link is kept exactly when "it matched" agrees
    # with "we want matches".
    return [
        link for link in links
        if (re.search(regex, link) is not None) == keep_matching
    ]
Source code for cabu.utils.link_extractor

Cabu

Navigation

Related Topics