Source code for morizon.utils

#!/usr/bin/python
# -*- coding: utf-8 -*-
import logging
from urllib.parse import quote, urlparse

import requests
from bs4 import BeautifulSoup
from scrapper_helpers.utils import caching, get_random_user_agent, key_md5, replace_all

from . import BASE_URL

log = logging.getLogger(__file__)
POLISH_CHARACTERS_MAPPING = {"ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n", "ó": "o", "ś": "s", "ż": "z", "ź": "z"}
POSSIBLE_CATEGORIES = ['mieszkania', 'domy', 'komercyjne', 'dzialki', 'garaze', 'pokoje']
POSSIBLE_TRANSACTIONS = ['do-wynajecia']


[docs]def get_max_page(url):
    """ Reads total page number on Morizon search page

    :param url:  web page url
    :type url: str
    :return: number on sub web pages for search
    :rtype: int
    """
    markup = BeautifulSoup(get_content_from_source(url), 'html.parser')
    last_page = markup.find_all('a', {'class': 'navigate next'})
    if not last_page:
        return 1
    num = last_page[0].previous.previous
    return int(num)


[docs]def encode_text_to_url(text):
    """ Change text to lower cases, gets rid of polish characters replacing them with simplified version,
    replaces spaces with dashes

    :param text: raw text
    :type text: str
    :return: encoded text which can be used in url
    :rtype: str
    """
    replace_dict = POLISH_CHARACTERS_MAPPING
    replace_dict.update({' ': '-'})
    return replace_all(text.lower(), replace_dict)


class URL:
    def __init__(self, category='nieruchomosci', city=None, street=None, transaction_type=None, filters=None):
        self.filters = filters or {}
        self.transaction_type = transaction_type
        self.street = street
        self.city = city
        self.category = category
        self.page = 1

    def get_url(self):
        """ Create Morizon search web page with given parameters

        :param category: type of property of interest (mieszkania/domy/garaże/działki)
        :param city: city
        :param street:  street
        :param transaction_type: type of transaction(sprzedaż/wynajem)
        :param filters: Dictionary with additional filters.
        :type category: str, None
        :type city: str, None
        :type street: str, None
        :type transaction_type: str, None
        :type filters: dict
        :return: url to web page
        :rtype: srt
        """
        url = BASE_URL
        if self.transaction_type:
            url += '/' + self.transaction_type
        url += '/' + encode_text_to_url(self.category)
        if self.city:
            url += '/' + encode_text_to_url(self.city)
        if self.street:
            url += '/' + encode_text_to_url(self.street)
        url += '/?page={0}&'.format(self.page)
        if self.filters and len(self.filters) > 0:
            for param, value in sorted(self.filters.items(), key=lambda item: item[0]):
                url += "ps{0}={1}&".format(quote(param), value)
        return url

    @classmethod
    def from_string(cls, url):
        parsed_url = urlparse(url)
        path_parts = parsed_url.path.split('/')[1:-1]
        transaction, category, city, street = None, 'nieruchomosci', None, None
        for i, path_part in enumerate(path_parts):
            if path_part == 'nieruchomosci':
                pass
            elif path_part in POSSIBLE_TRANSACTIONS:
                transaction = path_part
            elif path_part in POSSIBLE_CATEGORIES:
                category = path_part
            elif not city:
                city = path_part
            else:
                street = path_part

        filters = {}
        query_params = parsed_url.query.split('ps')[1:]
        for i, query_param in enumerate(query_params):
            query_param, value = replace_all(query_param, {'%5B': '[', '%5D': ']', '&': ''}).split('=')
            filters[query_param] = value

        return cls(category, city, street, transaction, filters)

    def next_page(self):
        self.page += 1
        return self

    def max_num_of_pages(self):
        return get_max_page(self.get_url())


@caching(key_func=key_md5)
[docs]def get_content_from_source(url):
    """ Connects with given url

    If environmental variable DEBUG is True it will cache response for url in /var/temp directory

    :param url: Website url
    :type url: str
    :return: Response for requested url
    """
    response = requests.get(url, headers={'User-Agent': get_random_user_agent()})
    try:
        response.raise_for_status()
    except requests.HTTPError as e:
        log.warning('Request for {0} failed. Error: {1}'.format(url, e))
        return None
    return response.content