#!/usr/bin/python
# -*- coding: utf-8 -*-
import logging
from urllib.parse import quote, urlparse
import requests
from bs4 import BeautifulSoup
from scrapper_helpers.utils import caching, get_random_user_agent, key_md5, replace_all
from . import BASE_URL
# Module logger; it is named after this file's path (``__file__``).
log = logging.getLogger(__file__)

# Polish diacritic letters (lower case) and the plain ASCII letter used for each in urls.
POLISH_CHARACTERS_MAPPING = {
    "ą": "a",
    "ć": "c",
    "ę": "e",
    "ł": "l",
    "ń": "n",
    "ó": "o",
    "ś": "s",
    "ż": "z",
    "ź": "z",
}

# Url path segments recognised as a property category when parsing a url.
POSSIBLE_CATEGORIES = [
    'mieszkania',
    'domy',
    'komercyjne',
    'dzialki',
    'garaze',
    'pokoje',
]

# Url path segments recognised as a transaction type when parsing a url.
POSSIBLE_TRANSACTIONS = [
    'do-wynajecia',
]
def get_max_page(url):
    """ Reads total page number on Morizon search page

    :param url: web page url
    :type url: str
    :return: number of result sub pages for the search; 1 when the page has no
        ``navigate next`` link or when its content could not be fetched
    :rtype: int
    """
    content = get_content_from_source(url)
    if content is None:
        # get_content_from_source returns None (after logging a warning) when the
        # request ends with an HTTP error; there is no markup to parse in that case.
        return 1
    markup = BeautifulSoup(content, 'html.parser')
    next_links = markup.find_all('a', {'class': 'navigate next'})
    if not next_links:
        return 1
    # Step two elements back in parse order from the "next" link; the code relies
    # on that element being the text holding the last page number.
    last_page_number = next_links[0].previous_element.previous_element
    return int(last_page_number)
def encode_text_to_url(text):
    """ Change text to lower case, replace polish characters with their simplified
    versions and replace spaces with dashes

    :param text: raw text
    :type text: str
    :return: encoded text which can be used in url
    :rtype: str
    """
    # Work on a copy: updating POLISH_CHARACTERS_MAPPING in place would permanently
    # add the space -> dash rule to the shared module-level constant.
    replace_dict = dict(POLISH_CHARACTERS_MAPPING)
    replace_dict[' '] = '-'
    return replace_all(text.lower(), replace_dict)
class URL:
    """ Morizon search url: holds the search criteria and the current result page

    The page number starts at 1 and is advanced with :meth:`next_page`.
    """

    def __init__(self, category='nieruchomosci', city=None, street=None, transaction_type=None, filters=None):
        """ Store search criteria

        :param category: type of property of interest (e.g. mieszkania/domy/garaże/działki)
        :param city: city
        :param street: street
        :param transaction_type: url path segment of the transaction type, used as-is
        :param filters: Dictionary with additional filters.
        :type category: str
        :type city: str, None
        :type street: str, None
        :type transaction_type: str, None
        :type filters: dict, None
        """
        self.filters = filters or {}
        self.transaction_type = transaction_type
        self.street = street
        self.city = city
        self.category = category
        self.page = 1

    def get_url(self):
        """ Create Morizon search web page url from the stored criteria

        Path segments are appended in the order transaction type, category, city,
        street; category, city and street go through ``encode_text_to_url``.
        The current page number and the filters (sorted by name, each emitted as
        ``ps<quoted name>=<value>&``) form the query string.

        :return: url to web page
        :rtype: str
        """
        url = BASE_URL
        if self.transaction_type:
            url += '/' + self.transaction_type
        url += '/' + encode_text_to_url(self.category)
        if self.city:
            url += '/' + encode_text_to_url(self.city)
        if self.street:
            url += '/' + encode_text_to_url(self.street)
        url += '/?page={0}&'.format(self.page)
        for param, value in sorted(self.filters.items(), key=lambda item: item[0]):
            url += "ps{0}={1}&".format(quote(param), value)
        return url

    @classmethod
    def from_string(cls, url):
        """ Build an instance from an existing search url

        The page number present in ``url`` is not read; the new instance starts
        at page 1.

        :param url: search page url
        :type url: str
        :return: instance holding the criteria found in the url
        :rtype: URL
        """
        parsed_url = urlparse(url)
        transaction, category, city, street = None, 'nieruchomosci', None, None
        # Skip empty segments rather than slicing off the first and last one, so a
        # url without a trailing slash does not lose its last path segment.
        for path_part in filter(None, parsed_url.path.split('/')):
            if path_part == 'nieruchomosci':
                pass
            elif path_part in POSSIBLE_TRANSACTIONS:
                transaction = path_part
            elif path_part in POSSIBLE_CATEGORIES:
                category = path_part
            elif not city:
                city = path_part
            else:
                street = path_part
        filters = {}
        # Split on the '&' separator and look only at the 'ps' prefix of each pair;
        # splitting the whole query on the substring 'ps' would break on any filter
        # name or value that contains those letters.
        for pair in parsed_url.query.split('&'):
            if not pair.startswith('ps'):
                continue
            # partition keeps any further '=' characters inside the value.
            name, _, value = pair[2:].partition('=')
            filters[replace_all(name, {'%5B': '[', '%5D': ']'})] = value
        return cls(category, city, street, transaction, filters)

    def next_page(self):
        """ Advance to the next result page

        :return: this instance, so calls can be chained
        :rtype: URL
        """
        self.page += 1
        return self

    def max_num_of_pages(self):
        """ Read total number of result pages for the current url

        :return: value of ``get_max_page`` for :meth:`get_url`
        :rtype: int
        """
        return get_max_page(self.get_url())
@caching(key_func=key_md5)
def get_content_from_source(url):
    """ Connects with given url

    NOTE(review): the original documentation states that when the environmental
    variable DEBUG is True the response is cached in the /var/temp directory; that
    behaviour belongs to the ``caching`` decorator from ``scrapper_helpers`` and is
    not visible in this module - confirm there.

    :param url: Website url
    :type url: str
    :return: Response body for requested url, None when the server answered with
        an HTTP error status
    :rtype: bytes, None
    """
    # Without a timeout requests waits indefinitely for an unresponsive server.
    response = requests.get(url, headers={'User-Agent': get_random_user_agent()}, timeout=30)
    try:
        response.raise_for_status()
    except requests.HTTPError as e:
        log.warning('Request for %s failed. Error: %s', url, e)
        return None
    return response.content