Telegram Bot Scraping Apartments for Sale
Radzion Chachura
Posted on January 8, 2023
I'm looking for a new apartment, but I don't want to waste time checking the real estate website by hand. Luckily, we can code! So let's write a TypeScript program that will run on AWS Lambda every hour, scrape new apartments, and send them to a Telegram chat.
Let's set up the AWS infrastructure with Terraform. We start with an S3 bucket that will store the code for the lambda. Then we create a DynamoDB table for keeping state, such as the list of apartment ids we have already seen. The lambda will receive its configuration through environment variables. It includes a Sentry key for error reporting, a Telegram token with a chat id to send apartments to, and the name of the DynamoDB table. To run the lambda every hour, we'll use CloudWatch events.
# S3 bucket that holds the zipped Lambda deployment package.
resource "aws_s3_bucket" "lambda_storage" {
  bucket = "${var.name}-storage"
}
# Zips the module's local `lambda` directory into `lambda.zip` next to it.
data "archive_file" "local_zipped_lambda" {
  type = "zip"
  source_dir = "${path.module}/lambda"
  output_path = "${path.module}/lambda.zip"
}
# Uploads the zipped Lambda package to the storage bucket.
resource "aws_s3_object" "zipped_lambda" {
  bucket = aws_s3_bucket.lambda_storage.bucket
  key    = "lambda.zip"
  source = data.archive_file.local_zipped_lambda.output_path

  # `source` is just a path that never changes, so without a content hash
  # Terraform cannot notice a rebuilt archive and would never re-upload it.
  etag = data.archive_file.local_zipped_lambda.output_md5
}
# DynamoDB table for the scraper state, keyed by a string `id`.
# PAY_PER_REQUEST means on-demand billing, so no capacity is provisioned.
resource "aws_dynamodb_table" "state" {
  name = "${var.name}-state"
  billing_mode = "PAY_PER_REQUEST"
  hash_key = "id"
  attribute {
    name = "id"
    type = "S"
  }
}
# Execution role for the Lambda function.
# The trust policy lets only the Lambda service assume it.
resource "aws_iam_role" "service" {
  name = var.name
  assume_role_policy = jsonencode(
    {
      Version = "2012-10-17",
      Statement = [
        {
          Action = "sts:AssumeRole",
          Principal = {
            Service = "lambda.amazonaws.com"
          },
          Effect = "Allow",
          Sid = ""
        }
      ]
    }
  )
}
# Attaches the service policy (logs + state table access) to the Lambda role.
resource "aws_iam_role_policy_attachment" "service" {
  role = aws_iam_role.service.name
  policy_arn = aws_iam_policy.service.arn
}
# The scraper Lambda. Its code comes from the zipped package in S3 and its
# configuration is passed through environment variables.
resource "aws_lambda_function" "service" {
  function_name = var.name

  s3_bucket = aws_s3_bucket.lambda_storage.bucket
  # Reference the uploaded object instead of repeating the literal key, so
  # Terraform uploads the archive before it creates the function.
  s3_key = aws_s3_object.zipped_lambda.key
  # Without a code hash Terraform sees no difference between builds and never
  # redeploys the function after the first apply.
  source_code_hash = data.archive_file.local_zipped_lambda.output_base64sha256

  handler     = "index.handler"
  runtime     = "nodejs16.x"
  memory_size = 1024
  timeout     = 50
  role        = aws_iam_role.service.arn

  environment {
    variables = {
      SENTRY_KEY           = var.sentry_key
      TELEGRAM_BOT_TOKEN   = var.telegram_bot_token
      TELEGRAM_BOT_CHAT_ID = var.telegram_bot_chat_id
      STATE_TABLE_NAME     = aws_dynamodb_table.state.name
    }
  }
}
# Permissions of the Lambda role: write CloudWatch logs and read/update the
# scraper state item in DynamoDB.
resource "aws_iam_policy" "service" {
  name = var.name
  path = "/"
  policy = jsonencode({
    Version = "2012-10-17",
    Statement = [
      {
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents"
        ],
        Resource = "arn:aws:logs:*:*:*",
        Effect   = "Allow"
      },
      {
        # Least privilege: the state provider only issues `get` and `update`
        # calls, so table-wide `dynamodb:*` (which includes DeleteTable) is
        # not needed.
        Action = [
          "dynamodb:GetItem",
          "dynamodb:UpdateItem"
        ],
        Resource = aws_dynamodb_table.state.arn,
        Effect   = "Allow"
      }
    ]
  })
}
# Schedule rule that fires once every hour.
resource "aws_cloudwatch_event_rule" "lambda" {
  name = var.name
  schedule_expression = "rate(1 hour)"
}
# Makes the Lambda function the target of the hourly schedule rule.
resource "aws_cloudwatch_event_target" "lambda" {
  rule = aws_cloudwatch_event_rule.lambda.name
  target_id = var.name
  arn = aws_lambda_function.service.arn
}
# Allows the events service to invoke the function, restricted to invocations
# that originate from the schedule rule above (`source_arn`).
resource "aws_lambda_permission" "lambda_cloudwatch" {
  statement_id = "AllowExecutionFromCloudWatch"
  action = "lambda:InvokeFunction"
  function_name = aws_lambda_function.service.function_name
  principal = "events.amazonaws.com"
  source_arn = aws_cloudwatch_event_rule.lambda.arn
}
At the entry point of our lambda, we set up Sentry for error handling and export the handler function. Here we get new real estate listings and send them to a Telegram chat.
import * as Sentry from "@sentry/serverless"
import { assertEnvVar } from "./utils/assertEnvVar"
import { findNewRealEstate } from "./findNewRealEstate"
// Sentry is configured once per Lambda container; the DSN is read from the
// SENTRY_KEY environment variable.
const sentryDsn = assertEnvVar("SENTRY_KEY")

Sentry.AWSLambda.init({
  autoSessionTracking: false,
  dsn: sentryDsn,
})

// Lambda entry point: the scraping routine passed through Sentry's handler wrapper.
export const handler = Sentry.AWSLambda.wrapHandler(findNewRealEstate)
import { getNewRealEstate } from "./sources/myHomeGe"
import { tellAboutUnits } from "./tellAboutUnits"
/**
 * Fetches the units that have not been reported yet and posts them to the
 * Telegram chat.
 */
export const findNewRealEstate = async () => {
  const freshUnits = await getNewRealEstate()

  await tellAboutUnits(freshUnits)
}
We have only one scraper, but we can expand the app by coding the getNewRealEstate function for other marketplaces. First, we create an instance of a state provider. It uses DynamoDB to store the data of a given website and provides two methods, one to get the state and another to update it. The state keeps the timestamp of the last visit and the ids of apartments we've seen already.
import { defaultSourceState, SourceState } from "./SourceState"
import { assertEnvVar } from "./utils/assertEnvVar"
import { DynamoDB } from "aws-sdk"
import { getUpdateParams } from "./shared/db/getUpdateParams"
// Shared DynamoDB client and the table name injected through the environment.
const documentClient = new DynamoDB.DocumentClient()
const tableName = assertEnvVar("STATE_TABLE_NAME")

/**
 * Reads and writes the persisted state of one source.
 *
 * Each source owns a single item in the state table, keyed by its name.
 */
export class StateProvider {
  /** @param name Source name, used as the `id` key of the state item. */
  constructor(readonly name: string) {}

  /**
   * Loads the stored state of the source.
   *
   * @returns The stored item laid over `defaultSourceState`. The result is
   * always a fresh object, also when nothing has been stored yet.
   */
  async get(): Promise<SourceState> {
    const { Item } = await documentClient
      .get({
        TableName: tableName,
        Key: { id: this.name },
      })
      .promise()

    // Merge instead of picking one or the other: an item written by a partial
    // `update` may lack fields, and handing out `defaultSourceState` itself
    // would let callers mutate the shared default.
    return { ...defaultSourceState, ...Item } as SourceState
  }

  /**
   * Writes the given fields to the source's state item.
   *
   * @param params Subset of the state to persist.
   */
  async update(params: Partial<SourceState>): Promise<void> {
    await documentClient
      .update({
        TableName: tableName,
        Key: { id: this.name },
        ...getUpdateParams(params),
      })
      .promise()
  }
}
import fetch from "node-fetch"
import { load } from "cheerio"
import { Unit } from "../Unit"
import { StateProvider } from "../StateProvider"
// Number of milliseconds in one day.
const msInDay = 86400000
// Name of this source; it is passed to the state provider as the state key.
const sourceName = "myhome.ge"
// Pre-filtered search URL (price cap, minimum area, bedrooms, floors, ... as
// encoded in the query string). The page number is appended by the caller.
// The `regions` parameter must be introduced by a plain `&`: an HTML-entity
// mangled `&reg` would fuse it into the value of `EnableMap`.
const realEstateSearchPage = `https://www.myhome.ge/en/s/Apartment-for-sale-Tbilisi?Keyword=Tbilisi&AdTypeID=1&PrTypeID=1&mapC=41.70931%2C44.78487&mapZ=12&mapOp=1&EnableMap=0&regions=687586034.689678147.689701920&districts=2022621279.906139527.1650325628.2185664.5965823289.798496409&cities=1996871&GID=1996871&FCurrencyID=1&FPriceTo=110000&AreaSizeFrom=70&FloorNums=notlast.notfirst&BedRoomNums=2.3&action_map=on&RenovationID=1.5.7`
/**
 * Extracts the listings from the HTML of one search-results page.
 *
 * Cards marked as banners are ignored. A card is skipped when its id, date or
 * link cannot be read.
 *
 * @param body Raw HTML of the page.
 * @returns One unit per usable card, with `createdAt` as a millisecond timestamp.
 */
const getUnitsFromPage = (body: string): Unit[] => {
  const $ = load(body)

  const now = new Date()
  const currentYear = now.getFullYear()
  // NOTE(review): listing times are presumably in the site's local time zone
  // while this code may run elsewhere, so a small "future" offset is tolerated
  // before a date is considered to belong to the previous year. Confirm.
  const futureToleranceMs = 24 * 60 * 60 * 1000

  const cards = $(".statement-card")
    .filter(":not(.banner)")
    // Was `:not(..ado_ban)`: the doubled dot is not a valid class selector.
    .filter(":not(.ado_ban)")

  return cards
    .toArray()
    .map((card): Unit | undefined => {
      const $card = load(card)

      // The first two `.d-block` elements carry the id and the date texts.
      const [rawId, rawDate] = $card(".d-block")
        .toArray()
        .map((el) => $card(el).text())
      if (!rawId || !rawDate) return undefined

      // The id is the second space-separated token of its text.
      const id = rawId.split(" ")[1]
      const url = $card("a:first").attr("href")
      if (!id || !url) return undefined

      // The card shows day, month and time but no year.
      const [day, monthString, time] = rawDate.split(" ")
      const timestampInYear = (year: number) =>
        new Date([day, monthString, year, time].join(" ")).getTime()

      let createdAt = timestampInYear(currentYear)
      // A December listing read in January would otherwise land almost a year
      // in the future and be treated as new on every run.
      if (createdAt - now.getTime() > futureToleranceMs) {
        createdAt = timestampInYear(currentYear - 1)
      }

      return { url, id, createdAt }
    })
    .filter((unit): unit is Unit => unit !== undefined)
}
/**
 * Walks the paginated search results and collects every unit created after
 * `lastVisitAt`.
 *
 * Pages are requested one at a time, starting with page 1, until a page
 * contains no unit newer than `lastVisitAt`.
 *
 * @param lastVisitAt Millisecond timestamp; only newer units are returned.
 * @returns The new units in the order they were encountered.
 * @throws Error when a page request is answered with a non-success status.
 */
const getUnits = async (lastVisitAt: number): Promise<Unit[]> => {
  const units: Unit[] = []

  for (let page = 1; ; page++) {
    const response = await fetch(`${realEstateSearchPage}&Page=${page}`)
    // An error page parses to zero cards, which would look like "nothing new"
    // and let the caller move its last-visit marker past unseen listings.
    if (!response.ok) {
      throw new Error(
        `Search page ${page} request failed with status ${response.status} ${response.statusText}`
      )
    }

    const body = await response.text()
    const newUnits = getUnitsFromPage(body).filter(
      (unit) => unit.createdAt > lastVisitAt
    )
    if (newUnits.length < 1) return units

    units.push(...newUnits)
  }
}
/**
 * Returns the units published since the previous run that have not been
 * reported yet, and records them in the persisted state.
 *
 * On the very first run (no stored last-visit timestamp) the last two days
 * are scanned.
 *
 * @returns New units, each id at most once.
 */
export const getNewRealEstate = async (): Promise<Unit[]> => {
  const stateProvider = new StateProvider(sourceName)
  const state = await stateProvider.get()

  // Taken before scraping: listings published while the pages are being
  // fetched then fall into the next run's window instead of being skipped.
  const visitStartedAt = Date.now()

  // `||` on purpose: a missing or zero timestamp both mean "never visited".
  const candidates = await getUnits(
    state.lastVisitAt || visitStartedAt - msInDay * 2
  )

  // Skip ids reported in earlier runs, and also ids seen earlier in this run:
  // a unit can show up on two pages when the listing shifts between requests.
  const seenIds = new Set(state.shown)
  const units: Unit[] = []
  for (const unit of candidates) {
    if (seenIds.has(unit.id)) continue
    seenIds.add(unit.id)
    units.push(unit)
  }

  await stateProvider.update({
    lastVisitAt: visitStartedAt,
    shown: [...state.shown, ...units.map((unit) => unit.id)],
  })

  return units
}
Once we have the last visit date, we want to get all units posted since that time and filter them to skip already shown ones. After that, we update the state and return these units.
The website has pagination, so we'll use a recursive function that receives units and the page number. First, we fetch the page, then take the body and send it to the scraper function. If there are no new apartments, we exit the recursion.
We scrape the page with the cheerio library, which provides a jQuery-like API. The function takes all the cards and converts them to the Unit type with URL, id, and creation date.
After we've collected new apartments, we want to send them to a Telegram chat. Here we get the token and chat id from environment variables and send each apartment as a new message.
import { Unit } from "./Unit"
import { assertEnvVar } from "./utils/assertEnvVar"
import TelegramBot from "node-telegram-bot-api"
/**
 * Posts the URL of every unit to the configured Telegram chat, one message
 * per unit, in the given order.
 *
 * Messages are sent one after another rather than all at once, so a large
 * batch is not fired into a single chat in one burst. Every unit is attempted
 * even if an earlier send fails.
 *
 * @param units Units to announce.
 * @throws The first send error, after all units have been attempted.
 */
export const tellAboutUnits = async (units: Unit[]): Promise<void> => {
  const telegramBotToken = assertEnvVar("TELEGRAM_BOT_TOKEN")
  const telegramChatId = assertEnvVar("TELEGRAM_BOT_CHAT_ID")
  const bot = new TelegramBot(telegramBotToken)

  const failures: unknown[] = []
  for (const { url } of units) {
    try {
      await bot.sendMessage(telegramChatId, url)
    } catch (error: unknown) {
      // Keep going: one rejected message must not hide the remaining units.
      failures.push(error)
    }
  }

  if (failures.length > 0) throw failures[0]
}
Posted on January 8, 2023
Join Our Newsletter. No Spam, Only the good stuff.
Sign up to receive the latest update from our blog.