-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_amazon_images.py
More file actions
25 lines (21 loc) · 1004 Bytes
/
get_amazon_images.py
File metadata and controls
25 lines (21 loc) · 1004 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from playwright.sync_api import sync_playwright
import re
# Matches image URLs whose ID is followed directly by the file extension and
# captures the ID portion only.
_IMAGE_URL_RE = re.compile(
    r'https://m\.media-amazon\.com/images/I/([A-Za-z0-9_\-]+)\.(?:jpg|png)'
)


def _extract_image_ids(html, limit=5):
    """Return up to *limit* distinct image IDs found in *html*, in page order."""
    candidates = _IMAGE_URL_RE.findall(html)
    # IDs of 8 characters or fewer are dropped as unlikely to be product
    # images; dict.fromkeys de-duplicates while keeping first-seen order.
    unique_ids = dict.fromkeys(m for m in candidates if len(m) > 8)
    return list(unique_ids)[:limit]


def main(url="https://www.amazon.com/dp/B0D9R7Q449"):
    """Load a product page in headless Chromium and print its image URLs.

    The rendered HTML is scanned for ``m.media-amazon.com`` image URLs; the
    first five distinct image IDs are printed as ``_AC_SL1500_`` JPEG URLs.

    Args:
        url: Page to load. Defaults to the product page this script was
            originally written for.

    Raises:
        Whatever Playwright raises on launch/navigation failure; the browser
        is closed before the exception propagates.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
            page.goto(url, wait_until="domcontentloaded")
            # Fixed 3 s pause after DOMContentLoaded — presumably to let page
            # scripts inject further image URLs before the HTML is captured.
            page.wait_for_timeout(3000)
            content = page.content()
        finally:
            # Close explicitly even when navigation fails or times out.
            browser.close()

    print("Found Amazon Image IDs:")
    for image_id in _extract_image_ids(content):
        print(f"https://m.media-amazon.com/images/I/{image_id}._AC_SL1500_.jpg")
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()