-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
181 lines (148 loc) · 5.49 KB
/
utils.py
File metadata and controls
181 lines (148 loc) · 5.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""
Utility functions for Finn.no scraper.
"""
import re
from typing import Dict, Optional
from urllib.parse import urlencode, urljoin, urlparse
from config import BASE_URL, SEARCH_URL, FUEL_TYPES, TRANSMISSION_TYPES
def build_search_url(
    page: int = 1,
    min_price: Optional[int] = None,
    max_price: Optional[int] = None,
    min_year: Optional[int] = None,
    max_year: Optional[int] = None,
    min_mileage: Optional[int] = None,
    max_mileage: Optional[int] = None,
    fuel_type: Optional[str] = None,
    sort_by: str = "PUBLISHED_DESC"
) -> str:
    """Assemble a search URL from SEARCH_URL and the supplied filters.

    Only filters that were actually provided end up in the query string:
    ``page`` is emitted for pages after the first, numeric range filters are
    emitted when they are not ``None``, and ``fuel_type`` / ``sort_by`` are
    emitted when they are truthy.

    Returns:
        ``SEARCH_URL`` followed by ``?`` and the URL-encoded parameters, or
        plain ``SEARCH_URL`` when no parameter applies.
    """
    # Numeric range filters, listed in the order they appear in the query.
    range_filters = (
        ("price_from", min_price),
        ("price_to", max_price),
        ("year_from", min_year),
        ("year_to", max_year),
        ("mileage_from", min_mileage),
        ("mileage_to", max_mileage),
    )

    query = {}

    # The first page is the default, so it needs no explicit parameter.
    if page > 1:
        query["page"] = page

    for key, value in range_filters:
        if value is not None:
            query[key] = value

    if fuel_type:
        query["fuel"] = fuel_type

    if sort_by:
        query["sort"] = sort_by

    if not query:
        return SEARCH_URL

    encoded = urlencode(query, doseq=True)
    return f"{SEARCH_URL}?{encoded}"
# Separator used between fields on a listing line.
_FIELD_SEP = '∙'
# A line ending in "<digits> kr"; at least one digit is required so that a
# bare trailing "kr" is never reported as a price.
_PRICE_RE = re.compile(r'(\d[\d\s]*)\s*kr\s*$')
# "kr" as a currency token, i.e. not embedded in a longer word ("krok").
_KR_TOKEN_RE = re.compile(r'(?<![^\W\d_])kr(?![^\W\d_])')
_YEAR_RE = re.compile(r'^\d{4}$')
_TIME_POSTED_RE = re.compile(r'^(\d+\s*(dg|min|time|dag)|Nå)\.?$')
_SELLER_TYPES = ("Forhandler", "Privat", "Merkeforhandler")
_TRANSMISSIONS = ("Manuell", "Automat")
_KNOWN_FUELS = (
    "Bensin", "Diesel", "El", "Hybrid", "Plug-in Bensin", "Plug-in Diesel",
)


def _parse_spec_line(line: str, result: Dict[str, str]) -> None:
    """Fill year / mileage / fuel / transmission from one spec line."""
    for part in (piece.strip() for piece in line.split(_FIELD_SEP)):
        if 'rekkevidde' in part:
            # Electric range also contains "km", so it must be tested before
            # the mileage check; it is only a fallback for a missing mileage.
            if not result["mileage"]:
                result["mileage"] = part
        elif _YEAR_RE.match(part):
            result["year"] = part
        elif 'km' in part:
            result["mileage"] = part
        elif part in _TRANSMISSIONS:
            result["transmission"] = part
        elif part in _KNOWN_FUELS or part in FUEL_TYPES:
            result["fuel_type"] = part


def _is_title_candidate(line: str) -> bool:
    """Return True when a line is not a price, spec, seller or time line."""
    if _FIELD_SEP in line or line.startswith('Betalt'):
        return False
    if line in _SELLER_TYPES or _TIME_POSTED_RE.match(line):
        return False
    return not _KR_TOKEN_RE.search(line)


def parse_listing_text(text: str) -> Dict[str, str]:
    """
    Parse the text of a listing card into its individual fields.

    Example text format:
        "Volvo V70 D2 / Momentum / Manuell / Meget pen / Cruise / Dab
        D2 / Momentum / Manuell / Meget pen / Cruise / Dab
        2014 ∙ 210 000 km ∙ Diesel ∙ Manuell
        109 999 kr
        Stange ∙ WIIK AUTO AS
        Forhandler
        1 dg."

    Args:
        text: Raw listing text, one field group per line. May be empty or
            ``None``.

    Returns:
        A dict with the keys ``title``, ``year``, ``mileage``, ``fuel_type``,
        ``transmission``, ``price``, ``location``, ``seller_name``,
        ``seller_type`` and ``time_posted``. Fields that were not found are
        empty strings.
    """
    result = {
        "title": "",
        "year": "",
        "mileage": "",
        "fuel_type": "",
        "transmission": "",
        "price": "",
        "location": "",
        "seller_name": "",
        "seller_type": "",
        "time_posted": ""
    }
    if not text:
        return result

    lines = [line.strip() for line in text.split('\n') if line.strip()]

    for line in lines:
        # Price (e.g. "109 999 kr"); only the first price line is kept.
        if not result["price"]:
            price_match = _PRICE_RE.search(line)
            if price_match:
                # split()/join collapses every kind of whitespace, including
                # non-breaking spaces used as thousands separators.
                amount = " ".join(price_match.group(1).split())
                result["price"] = amount + " kr"
                continue

        # Year, mileage, fuel, transmission
        # (e.g. "2014 ∙ 210 000 km ∙ Diesel ∙ Manuell")
        if _FIELD_SEP in line and 'km' in line:
            _parse_spec_line(line, result)
            continue

        # Seller type
        if line in _SELLER_TYPES:
            result["seller_type"] = line
            continue

        # Time posted (e.g. "1 dg.", "3 dg.", "Nå", "1 min.")
        if _TIME_POSTED_RE.match(line):
            result["time_posted"] = line
            continue

        # Location and seller name (e.g. "Stange ∙ WIIK AUTO AS"). Lines
        # carrying a currency token are skipped; place names that merely
        # contain the letters "kr" are not.
        if _FIELD_SEP in line and not _KR_TOKEN_RE.search(line):
            parts = [piece.strip() for piece in line.split(_FIELD_SEP)]
            result["location"] = parts[0]
            result["seller_name"] = parts[1]

    # The title is the first line that is none of the structured lines above.
    result["title"] = next(
        (line for line in lines if _is_title_candidate(line)), ""
    )
    return result
def extract_listing_id(url: Optional[str]) -> str:
    """Extract the numeric listing ID from a Finn.no item URL.

    Args:
        url: Listing URL, e.g. ``https://www.finn.no/mobility/item/436201670``.
            May be empty or ``None`` (for instance when a link has no href).

    Returns:
        The digits following ``/mobility/item/``, or an empty string when the
        URL is missing or does not contain such a path.
    """
    # Guard first: re.search raises TypeError when handed None.
    if not url:
        return ""
    match = re.search(r'/mobility/item/(\d+)', url)
    return match.group(1) if match else ""
def make_absolute_url(url: Optional[str]) -> str:
    """Convert a relative URL to an absolute one, resolved against BASE_URL.

    Args:
        url: Absolute or relative URL. ``None`` is accepted and yields an
            empty string.

    Returns:
        ``url`` unchanged when it already starts with ``http://`` or
        ``https://``; otherwise ``url`` joined onto ``BASE_URL``.
    """
    if url is None:
        return ""
    # Check the full scheme prefix so that a relative path which merely
    # begins with the letters "http" is still resolved against BASE_URL.
    if url.startswith(("http://", "https://")):
        return url
    return urljoin(BASE_URL, url)
def clean_text(text: str) -> str:
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped. A falsy input (empty string
    or ``None``) yields an empty string.
    """
    # str.split() without arguments splits on any whitespace run and
    # discards empty pieces, so re-joining normalizes the spacing.
    return " ".join(text.split()) if text else ""
def translate_fuel_type(norwegian: str) -> str:
    """Look up the English name for a Norwegian fuel type in FUEL_TYPES.

    Values without an entry in FUEL_TYPES are returned unchanged.
    """
    english = FUEL_TYPES.get(norwegian, norwegian)
    return english
def translate_transmission(norwegian: str) -> str:
    """Look up the English name for a Norwegian transmission type.

    The lookup uses TRANSMISSION_TYPES; values without an entry there are
    returned unchanged.
    """
    english = TRANSMISSION_TYPES.get(norwegian, norwegian)
    return english