Script Python pour aspirer les documents numérisés de la Zentral- und Landesbibliothek de Berlin. Il extrait les pages en JPG puis assemble un PDF. Les URL soumises doivent être du type : https://digital.zlb.de/viewer/image/16054086_1911/12/
, c'est-à-dire l'adresse affichée quand vous lisez une page depuis la visionneuse.
import os
import re
from PIL import Image # "pip install Pillow" to install package
import requests # "pip install requests" to install package
import shutil
import mimetypes
from bs4 import BeautifulSoup # "pip install beautifulsoup4" to install package
# Minimal browser-like User-Agent; some servers reject requests without one.
headers = {'User-Agent': 'Mozilla/5.0'}
# functions
def downloadFile(url, fileName):
    """Download ``url`` to disk as ``fileName`` plus a guessed extension.

    The extension is derived from the response's Content-Type header,
    falling back to ``.jpg`` (the ZLB viewer serves JPEG page images)
    when the MIME type is unknown.

    Returns the final file name (with extension) on success.
    Raises requests.HTTPError on a non-2xx response instead of silently
    writing the error page to disk.
    """
    url = url.replace("\\", "")
    # Stream the body so large page scans are never fully held in memory;
    # the context manager guarantees the connection is released.
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        content_type = response.headers.get('content-type', '')
        # guess_extension() returns None for unrecognised types; a None
        # extension would make `fileName += extension` raise TypeError.
        extension = mimetypes.guess_extension(content_type) or ".jpg"
        fileName += extension
        with open(fileName, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    return fileName
def getRecordName(url):
    """Extract the record identifier from a ZLB viewer page URL.

    Example:
        https://digital.zlb.de/viewer/image/16054086_1911/12/ -> "16054086_1911"

    Prints an error message and returns None when the URL does not match.
    """
    # Dots in the hostname are escaped so lookalike hosts are rejected;
    # the original pattern treated them as wildcards.
    reg = re.match(r"https?://digital\.zlb\.de/viewer/image/([A-Za-z0-9_]*)/", url)
    if reg:
        return reg.group(1)
    print("Erreur : format d'URL incorrect. Exemple correct https://digital.zlb.de/viewer/image/16054086_1911/9/LOG_0006/")
    return None
def getMaxPagesFromRecord(record):
    """Return the page count of a ZLB record.

    Scrapes the record's viewer page for the OpenURL "rft.tpages" field.
    Falls back to asking the user when the field is absent, and returns
    None when the page cannot be fetched at all.
    """
    url = "https://digital.zlb.de/viewer/image/" + record + "/"
    response = requests.get(url, headers=headers, timeout=30)
    if response:  # a Response is truthy for 2xx/3xx status codes
        # response.text is already a str; no str() wrapper needed.
        reg = re.search(r"rft\.tpages=(\d+)&", response.text)
        if reg:
            return int(reg.group(1))
        print("Impossible de trouver le nombre maximal de page automatiquement.")
        return int(input("Nombre de pages à extraire :"))
    print("Erreur : la page ZLB n'est pas chargeable.")
    return None
def findImageFromUrl(url, resolution):
    """Return the image URL of one viewer page at the requested width.

    Reads the page's og:image meta tag (which points at a 300px-wide
    image) and rewrites the width segment to ``resolution``.
    Prints an error and returns None when the page cannot be fetched or
    carries no og:image tag (the original crashed with IndexError there).
    """
    response = requests.get(url, headers=headers, timeout=30)
    if not response:
        print("Erreur : la page ZLB n'est pas chargeable.")
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    # find_all is the modern name; findAll is a deprecated alias.
    metaImage = soup.find_all("meta", {"property": "og:image"})
    if not metaImage:
        print("Erreur : image introuvable sur la page ZLB.")
        return None
    img = metaImage[0].get("content")
    # Swap the 300px thumbnail width for the requested resolution.
    img = img.replace("/300,/", "/" + str(resolution) + ",/")
    return img
#input
# --- input ---
inputUrl = input("Adresse de la page ZLB à aspirer :")
resolution = int(input("Résolution (1300 recommandé) :"))
# --- get record infos ---
record = getRecordName(inputUrl)
if record:
    maxPages = getMaxPagesFromRecord(record)
# --- download pages as JPG ---
# (short-circuit: maxPages is only evaluated when record is truthy,
# i.e. when it was actually assigned above)
if record and maxPages:
    for i in range(1, maxPages + 1):
        print("Page", i, "sur", maxPages)
        url = "https://digital.zlb.de/viewer/image/" + record + "/" + str(i) + "/"
        img = findImageFromUrl(url, resolution)
        # findImageFromUrl returns None on failure; skipping the page
        # avoids crashing downloadFile on a None URL.
        if img:
            downloadFile(img, record + "_" + str(i).zfill(8))
#assemble PDF
filesInFolder = os.listdir()
imagesInFolder = []
imagesRGBInFolder = []
for file in filesInFolder:
ext = os.path.splitext(file)[1]
imageExtensions = [".jpg",".png",".gif",".jpeg",".svg"]
if ext in imageExtensions:
imagesInFolder.append(file)
img = Image.open(file)
if img.mode == "RGBA":
img = img.convert(RGB)
imagesRGBInFolder.append(img)
if imagesRGBInFolder:
imagesRGBInFolder[0].save(record+".pdf", save_all=True, quality=85, append_images=imagesRGBInFolder[1:])
#delete JPG
for file in imagesInFolder:
os.remove(file)
Pour éviter des erreurs de mémoire avec les documents comportant beaucoup de pages, il faut utiliser une installation 64 bits de Python.