Python script to automate downloading the gallery images from the architecture sites Baunetz and Baunetz Architekten. Requires Python plus the Requests and BeautifulSoup libraries (the pip commands are noted in the code below).
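For reference, the script only accepts the two URL shapes checked by the regular expressions further down; the path segments below are placeholders meant to illustrate the expected format, not real addresses:

    https://www.baunetz.de/meldungen/<article-slug>.html
    https://www.baunetz-architekten.de/<segment>/<segment>/projekt/<project-slug>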
#! python3
import re
import shutil
import requests # "pip install requests" to install package
from bs4 import BeautifulSoup # "pip install beautifulsoup4" to install package
headers = {'User-Agent': 'Mozilla/5.0'}
inputURL = input("Address of the Baunetz page to scrape: ")
# Functions
def downloadFile(url):
    # Download one image into the current directory, keeping its original file name.
    url = url.replace("\\", "")  # remove stray backslashes (URLs pulled from embedded JSON may contain escaped slashes)
    filename = url.split("/")[-1]
    responseImg = requests.get(url, stream=True)
    with open(filename, 'wb') as out_file:
        shutil.copyfileobj(responseImg.raw, out_file)
    del responseImg

def batchDownload(urls):
    # Download every URL in the list and report progress.
    print(len(urls), "image(s) found.")
    c = 1
    for url in urls:
        downloadFile(url)
        print("Download", c, "/", len(urls))
        c += 1
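# The two helpers above can also be called directly with a list of image URLs;
# the addresses below are hypothetical and only illustrate the expected input:
#
#   batchDownload([
#       "https://example.com/gallery/photo-01.jpg",
#       "https://example.com/gallery/photo-02.jpg",
#   ])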
# Patterns
patternBaunetz = re.compile(r"^https?://(www\.)?baunetz\.de/meldungen/([^.]*)\.html$")
patternBaunetzArchitekten = re.compile(r"^https?://(www\.)?baunetz-architekten\.de/([^/]*)/([^/]*)/projekt/([^/]*)$")
# Baunetz
if patternBaunetz.match(inputURL):
    response = requests.get(inputURL, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    scriptTags = soup.find_all("script")
    for scriptTag in scriptTags:
        s = str(scriptTag)
        if s.find("xxlGalerie.xxlimages") >= 0:  # slideshow data sits in an inline <script>
            urls = re.findall(r"'url': '(https?://[^']*)'", s)
            batchDownload(urls)
    del response
# Baunetz Architekten
elif patternBaunetzArchitekten.match(inputURL):
    urls = []
    response = requests.get(inputURL, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    # cover image
    coverImg = soup.select("div.project-detail-image img")
    url = coverImg[0].get("data-src")
    urls.append(url)
    # images shown on the landing page
    galleryImgs = soup.select("div.project-detail-gallery__image img")
    for img in galleryImgs:
        url = img.get("data-src")
        urls.append(url)
    # "data-additional-images" attribute (completes the slideshow); it appears to hold
    # JSON, so the extracted URLs may still contain escaped slashes, which downloadFile() strips
    moreImgs = soup.select("div[data-additional-images]")
    s = str(moreImgs[0].get("data-additional-images"))
    moreImgsUrls = re.findall(r'"src":"(https:[^"]*)', s)
    for moreImgsUrl in moreImgsUrls:
        urls.append(moreImgsUrl)
    batchDownload(urls)
    del response
else:
    print("The address does not match a Baunetz or Baunetz Architekten page.")
# End
input("Press Enter to close")