# Source code for chicken_turtle_util.http

# Copyright (C) 2015, 2016 VIB/BEG/UGent - Tim Diels <timdiels.m@gmail.com>
#
# This file is part of Chicken Turtle Util.
#
# Chicken Turtle Util is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Chicken Turtle Util is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Chicken Turtle Util.  If not, see <http://www.gnu.org/licenses/>.

'''
HTTP utilities. Contains only `download`, which downloads an HTTP resource.
'''

from urllib.parse import urlparse
from pathlib import Path
import requests
import re

#TODO should pass in a file/directory to store it. Gives flexibility to use a
#temp dir and wouldn't require this function to cleanup either, that's the
#responsibility of the caller

#TODO rm in favor of https://docs.python.org/3.0/library/urllib.request.html#urllib.request.urlretrieve
# or rewrite this to make use of it instead. Its returned filename has the right extension, just not the right name. Sometimes that's all you need
# It already nicely raises on 404

# Based on http://stackoverflow.com/a/16696317/1031434
def download(url, destination):
    '''
    Download an HTTP resource to a file

    Parameters
    ----------
    url : str
        HTTP resource to download
    destination : pathlib.Path
        Location at which to store downloaded resource. If `destination` does
        not exist, it's assumed to be a file path. If `destination` exists and
        is a file, it is overwritten. If `destination` exists and is a
        directory, the file will be saved inside the directory with as name the
        file name suggested by a server, if any, or the last part of the URL
        otherwise (excluding query and fragment parts). Only the last path
        component of the chosen name is used, so the file always ends up
        directly inside `destination`; if no usable name remains, the file is
        named ``unknown``.

    Returns
    -------
    path : pathlib.Path
        Path to the downloaded file.
    name : str or None
        File name suggested by the server or None if none was suggested. This
        is the name as sent by the server; it may differ from ``path.name``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx or 5xx status code. Nothing is written
        to `destination` in that case.
    '''
    response = requests.get(url, stream=True)
    try:
        # Fail before touching the file system, otherwise the body of an error
        # page would be saved as if it were the requested resource
        response.raise_for_status()

        # Get file name suggested by server
        file_name = _suggested_file_name(response.headers)

        # Ensure destination is a file
        if destination.is_dir():
            destination /= _local_file_name(file_name, url)

        # Download
        with destination.open('wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)
    finally:
        # A streamed response holds on to its connection until it is closed,
        # so close it even when writing fails half way
        response.close()
    return destination, file_name


def _suggested_file_name(headers):
    '''
    Get the file name suggested by the Content-Disposition header

    Parameters
    ----------
    headers : Mapping
        Response headers.

    Returns
    -------
    str or None
        The value of the ``filename`` parameter, or None if the header is
        missing or has no (non-empty) ``filename`` parameter.
    '''
    disposition = headers.get('content-disposition')
    if not disposition:
        return None

    # Either a quoted value, or an unquoted one running up to the next
    # parameter. `\b...\s*=` skips the extended `filename*=` parameter, whose
    # value is encoded differently and would otherwise be captured verbatim.
    match = re.search(
        r'\bfilename\s*=\s*(?:"([^"]+)"|([^";]+))',
        disposition,
        re.IGNORECASE
    )
    if not match:
        return None
    name = (match.group(1) or match.group(2)).strip()
    return name or None


def _local_file_name(suggested_name, url):
    '''
    Choose the name under which to save a download inside a directory

    Parameters
    ----------
    suggested_name : str or None
        File name suggested by the server, if any.
    url : str
        URL being downloaded, used when the server suggested no usable name.

    Returns
    -------
    str
        A single path component: never empty, ``.`` or ``..`` and free of
        path separators.
    '''
    candidates = (suggested_name, Path(urlparse(url).path).name)
    for candidate in candidates:
        if not candidate:
            continue
        # The name comes from the server or the URL: drop any directory part
        # so it cannot point outside of the destination directory
        name = re.split(r'[\\/]', candidate)[-1]
        if name not in ('', '.', '..'):
            return name
    return 'unknown'