Source code for morizon.offer

#!/usr/bin/python
# -*- coding: utf-8 -*-
import datetime as dt
import json

import re

from bs4 import BeautifulSoup
from scrapper_helpers.utils import replace_all, finder

from morizon.utils import get_content_from_source


@finder(class_='paramIconPrice', many=False)
def get_price_for_offer(item, *args, **kwargs):
    """ Parse price information

    The first number found in the element text is used. A decimal comma
    (e.g. ``1500,50``) is turned into a dot before conversion, because
    ``float`` does not accept a comma as the decimal separator.

    :param item: element whose ``text`` holds the price
    :param args: unused
    :param kwargs: unused
    :return: price of offer or None if there is no element or no number in it
    :rtype: float, None
    """
    if not item:
        return None
    # NOTE(review): only the first run of digits is read, so a price written
    # with spaces between thousands would be cut short - confirm against the
    # real page markup.
    numbers = re.findall(r'\d+,\d+|\d+', item.text)
    if not numbers:
        return None
    return float(numbers[0].replace(',', '.'))


@finder(class_='paramIconLivingArea', many=False)
def get_surface_for_offer(item, *args, **kwargs):
    """ Parse surface area of property in offer

    The first number in the element text is read; a decimal comma is
    replaced with a dot so that ``float`` can convert it.

    :param item: element whose ``text`` holds the living area
    :param args: unused
    :param kwargs: unused
    :return: surface area of property or None if there is no element or
        no number in it
    :rtype: float, None
    """
    # A missing element yields None instead of an AttributeError, the same
    # way the rooms, floor and gps parsers treat a missing element.
    if not item:
        return None
    numbers = re.findall(r'\d+,\d+|\d+', item.text)
    if not numbers:
        return None
    return float(numbers[0].replace(',', '.'))


@finder(class_='paramIconNumberOfRooms', many=False)
def get_rooms_for_offer(item, *args, **kwargs):
    """ Extract the room count of the property

    :param item: element whose ``text`` contains the number of rooms
    :param args: unused
    :param kwargs: unused
    :return: first integer found in the text, or None when no element
        was supplied
    :rtype: int, None
    """
    if item:
        # Only the first run of digits in the text is taken into account.
        first_number = re.findall(r'\d+', item.text)[0]
        return int(first_number)
    return None


@finder(text=re.compile(r'Piętro:'), many=False)
def get_floor_for_offer(item, *args, **kwargs):
    """ Extract the floor on which the property is located

    The value is read from the first ``td`` cell of the table row that
    contains the matched label.

    :param item: node matched for the floor label
    :param args: unused
    :param kwargs: unused
    :return: floor number (0 for 'parter'), or None when no node was supplied
    :rtype: int, None
    """
    if item:
        cell_text = item.find_parent('tr').find('td').text
        # Drop newlines and spaces, then keep only the part before the first
        # slash (presumably "floor/total floors" - confirm against the page).
        level = replace_all(cell_text, {'\n': '', ' ': ''}).split('/')[0]
        if level == 'parter':
            return 0
        return int(level)
    return None


@finder(class_='breadcrumbs', many=False)
def get_city_for_offer(item, *args, **kwargs):
    """ Read the city name from the breadcrumbs

    :param item: breadcrumbs element; its ``text`` is split on blank lines
    :param args: unused
    :param kwargs: unused
    :return: fifth breadcrumb fragment with spaces removed
    :rtype: str
    """
    breadcrumb_parts = item.text.split('\n\n')
    # Index 4 is a fixed position within the breadcrumb trail.
    city_raw = breadcrumb_parts[4]
    return replace_all(city_raw, {' ': ''})


@finder(class_='summaryLocation')
def get_street_for_offer(items, *args, **kwargs):
    """ Read the street name from the location summary

    :param items: sequence of matched elements; only the first one is used
    :param args: unused
    :param kwargs: unused
    :return: second-to-last line of the element text
    :rtype: str
    """
    location = items[0]
    # Blank lines are removed first so that splitting yields content lines.
    flattened = location.text.replace('\n\n', '')
    lines = flattened.split('\n')
    return lines[-2]


@finder(class_='phone hidden')
def get_phone_for_offer(items, *args, **kwargs):
    """ Read the contact phone number

    :param items: sequence of matched elements; only the first one is used
    :param args: unused
    :param kwargs: unused
    :return: text of the first matched element
    :rtype: str
    """
    first_match = items[0]
    return first_match.text


@finder(class_=re.compile(r'image\d+'))
def get_images_for_offer(items, *args, **kwargs):
    """ Collect image addresses of the offer

    :param items: matched elements, each exposing an ``img`` child with a
        ``data-original`` attribute
    :param args: unused
    :param kwargs: unused
    :return: list of image addresses, empty when nothing was matched
    :rtype: list
    """
    image_urls = []
    for link in items:
        source_url = link.img.get('data-original')
        # Swap the '/91/64/4/' path segment for '/1280/768/16/'
        # (presumably selects a larger rendition - confirm with the site).
        image_urls.append(source_url.replace('/91/64/4/', '/1280/768/16/'))
    return image_urls


@finder(itemprop='name', many=False)
def get_date_for_offer(item, *args, **kwargs):
    """ Read the date on which the offer was added

    The first ``dd-mm-yyyy`` date found in the ``content`` attribute is
    converted to whole seconds elapsed since 1970-01-01 (midnight of that
    day, naive datetimes, no timezone handling).

    :param item: element providing a ``content`` attribute
    :param args: unused
    :param kwargs: unused
    :return: date of adding the offer as seconds since 1970-01-01
    :rtype: int
    """
    date_text = re.findall(r'\d\d-\d\d-\d\d\d\d', item.get('content'))[0]
    day, month, year = (int(part) for part in date_text.split('-'))
    elapsed = dt.datetime(year, month, day) - dt.datetime(1970, 1, 1)
    return int(elapsed.total_seconds())


@finder(class_='ownerContact clearfix', many=False)
def get_poster_for_offer(item, *args, **kwargs):
    """ Read the name of the person or company that posted the offer

    :param item: contact element; its first ``strong`` child holds the name
    :param args: unused
    :param kwargs: unused
    :return: first two space-separated words of the header, or a single
        word when no such pair comes first
    :rtype: str
    """
    name_tag = item.find('strong')
    matches = re.findall(r'\w+ \w+|\w+', name_tag.text)
    return matches[0]


@finder(class_='description', many=False)
def get_description_for_offer(item, *args, **kwargs):
    """ Read the description of the offer

    :param item: description element
    :param args: unused
    :param kwargs: unused
    :return: unmodified text of the element
    :rtype: str
    """
    description_text = item.text
    return description_text


@finder(class_='GoogleMap', many=False)
def get_gps_for_offer(item, *args, **kwargs):
    """ Read the geographical coordinates of the property

    :param item: map element carrying ``data-lat`` and ``data-long``
        attributes
    :param args: unused
    :param kwargs: unused
    :return: (latitude, longitude) exactly as stored in the attributes,
        or None when no element was supplied
    :rtype: tuple, None
    """
    if item:
        return item.get('data-lat'), item.get('data-long')
    return None


@finder(class_='breadcrumbs', many=False)
def get_voivodeship_for_offer(item, *args, **kwargs):
    """ Read the voivodeship name from the breadcrumbs

    :param item: breadcrumbs element; its ``text`` is split on blank lines
    :param args: unused
    :param kwargs: unused
    :return: fourth breadcrumb fragment with spaces removed
    :rtype: str
    """
    breadcrumb_parts = item.text.split('\n\n')
    # Index 3 is a fixed position within the breadcrumb trail.
    voivodeship_raw = breadcrumb_parts[3]
    return replace_all(voivodeship_raw, {' ': ''})


def get_meta_data(markup):
    """ Extract offer metadata embedded in the page source

    Finds the ``__layer.push({"property":`` call in the markup and decodes
    the JSON value that directly follows it.

    :param markup: page markup; converted with ``str`` before searching
    :return: decoded value of the ``property`` entry
    :rtype: dict
    :raises IndexError: if the marker is not present in the markup
    :raises json.JSONDecodeError: if the text after the marker does not
        start with valid JSON
    """
    payload = str(markup).split('__layer.push({"property":')[1]
    # raw_decode stops at the end of the first complete JSON value, so the
    # result no longer depends on a "company" or "event" key following the
    # property object, nor breaks when such a key appears inside it.
    # Leading whitespace is stripped because raw_decode does not skip it.
    data, _ = json.JSONDecoder().raw_decode(payload.lstrip())
    return data


def get_offer_data(url):
    """ Parse data from offer page url

    Downloads the page, parses it with BeautifulSoup and runs every field
    parser of this module on the resulting markup.

    :param url: web page with offer
    :type url: str
    :return: Dictionary with details of an offer
    :rtype: dict
    """
    markup = BeautifulSoup(get_content_from_source(url), 'html.parser')
    # The offer id comes from the metadata embedded in the page scripts.
    meta_data = get_meta_data(markup)
    return {
        'id': meta_data.get('id'),
        'price': get_price_for_offer(markup),
        'surface': get_surface_for_offer(markup),
        'rooms': get_rooms_for_offer(markup),
        'floor': get_floor_for_offer(markup),
        'voivodeship': get_voivodeship_for_offer(markup),
        'city': get_city_for_offer(markup),
        'street': get_street_for_offer(markup),
        'phone': get_phone_for_offer(markup),
        'date_added': get_date_for_offer(markup),
        'poster_name': get_poster_for_offer(markup),
        'gps': get_gps_for_offer(markup),
        'description': get_description_for_offer(markup),
        'images': get_images_for_offer(markup),
        'url': url
    }