#!/usr/bin/env python3
# vim: set ts=4 sw=4 et: coding=UTF-8

#
# Copyright (c) 2008-2009, Novell, Inc.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301
# USA
#
# (Licensed under the LGPLv2.1 or later)
#
# Parts of this code come from convert-to-tarball.py (in the releng GNOME
# svn module), which has the same license.
#
#
# Authors: Vincent Untz <vuntz@opensuse.org>
#

import ftplib
import hashlib
import optparse
import os
import queue
import re
import shutil
import socket
import sys
import tempfile
import threading
import time
import urllib.error
import urllib.parse
import urllib.request

from html.parser import HTMLParser
from posixpath import join as posixjoin # Handy for URLs
# NOTE(review): sgmllib does not exist on Python 3; this import is kept for
# now but is unneeded once the listers below derive from HTMLParser.
from sgmllib import SGMLParser

try:
    from lxml import etree as ET
except ImportError:
    try:
        from xml.etree import cElementTree as ET
    except ImportError:
        import cElementTree as ET

import feedparser

from util import *

USE_DEBUG = False
MAX_THREADS = 10
URL_HASH_ALGO = 'md5'


#######################################################################


def _line_is_comment(line):
    return line.strip() == '' or line[0] == '#'


#######################################################################


# Fix some locations to point to what are really downloads.
def _location_fix(location):
    return location


#######################################################################


class UpstreamRawDownloadError(Exception):
    """Raised when fetching the raw upstream listing fails (cached error)."""

    def __init__(self, value):
        # keep the message in .msg: callers read e.msg when reporting
        self.msg = value

    def __str__(self):
        return '%r' % (self.msg,)


class UpstreamDownloadError(Exception):
    """Raised when no suitable upstream version/tarball can be found."""

    def __init__(self, value):
        # keep the message in .msg: callers read e.msg when reporting
        self.msg = value

    def __str__(self):
        return '%r' % (self.msg,)


#######################################################################


# comes from convert-to-tarball.py
class urllister(HTMLParser):
    """Collect the href of every <a> tag seen while parsing into self.urls.

    Ported from sgmllib.SGMLParser (removed in Python 3) to
    html.parser.HTMLParser; the interface (feed()/close()/.urls) is
    unchanged.
    """

    def reset(self):
        HTMLParser.reset(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)


#######################################################################


class svnurllister(HTMLParser):
    """Collect the href of every <file> tag (SVN web listings) into self.urls.

    Ported from sgmllib.SGMLParser (removed in Python 3) to
    html.parser.HTMLParser; the interface (feed()/close()/.urls) is
    unchanged.
    """

    def reset(self):
        HTMLParser.reset(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag != 'file':
            return
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)


#######################################################################


def _get_cache_paths_from_url(url, cache_dir):
    """Return (cache_path, cache_error_path) for url inside cache_dir.

    The cache file name is the URL_HASH_ALGO hex digest of the URL; the
    error file is the same path plus an '.error' suffix.  Returns
    (None, None) when no cache directory is configured.
    """
    if not cache_dir:
        return (None, None)

    hasher = hashlib.new(URL_HASH_ALGO)
    # hashlib requires bytes on Python 3: encode the URL first
    hasher.update(url.encode('utf-8'))
    digest = hasher.hexdigest()
    cache = os.path.join(cache_dir, digest)
    return (cache, cache + '.error')


#######################################################################


# based on code from convert-to-tarball.py
def _get_files_from_http(url, cache_dir):
    """Fetch the HTML index at url (or its cached copy) and return
    (url, list of hrefs found in the page).

    Raises UpstreamRawDownloadError when a cached download error has been
    recorded for this URL.
    """
    (cache, cache_error) = _get_cache_paths_from_url (url, cache_dir)

    if cache_error and os.path.exists(cache_error):
        with open(cache_error) as f:
            raise UpstreamRawDownloadError(f.read())

    if cache and os.path.exists(cache):
        fin = open(cache)
    else:
        obj = urllib.request.build_opener()
        fin = obj.open(url)

    try:
        data = fin.read()
    finally:
        fin.close()
    # network responses are bytes on Python 3, but the parser wants str
    if isinstance(data, bytes):
        data = data.decode('utf-8', 'replace')

    # Get the files
    parser = urllister()
    parser.feed(data)
    parser.close()
    files = parser.urls

    return (url, files)


#######################################################################


# based on code from convert-to-tarball.py
def _get_files_from_ftp(url):
    """Return (url, directory listing) for an FTP directory URL.

    Logs in anonymously unless the URL embeds credentials.
    """
    parsed = urllib.parse.urlparse(url)

    connection = ftplib.FTP(parsed.hostname)
    connection.login(parsed.username or 'anonymous', parsed.password or '')
    connection.cwd(parsed.path)
    listing = connection.nlst()
    connection.quit()

    return (url, listing)


#######################################################################


# based on code from convert-to-tarball.py
def _get_files_from_subdir_http(url, limit):
    """Crawl an HTTP index, descending into the newest version-numbered
    subdirectory (e.g. 1.2/ or 1.x/) until no deeper directory is found.

    Returns (final location, hrefs listed there).
    """
    obj = urllib.request.build_opener()
    # Note that we accept directories called 1.x.X
    good_dir = re.compile(r'^(([0-9]+|[xX])\.)*([0-9]+|[xX])/?$')
    def hasdirs(x): return good_dir.search(x)
    def fixdirs(x): return re.sub(r'^((?:(?:[0-9]+|[xX])\.)*)([0-9]+|[xX])/?$', r'\1\2', x)
    location = url
    # Follow 302 codes when retrieving URLs, speeds up conversion by 60sec
    redirect_location = location
    while True:
        # Get the files
        usock = obj.open(redirect_location)
        data = usock.read()
        usock.close()
        # network responses are bytes on Python 3; the parser wants str
        if isinstance(data, bytes):
            data = data.decode('utf-8', 'replace')
        parser = urllister()
        parser.feed(data)
        parser.close()
        files = parser.urls

        # Check to see if we need to descend to a subdirectory
        newdirs = list(filter(hasdirs, files))
        newdirs = list(map(fixdirs, newdirs))
        if not newdirs:
            break

        newdir = _get_latest_version(newdirs, limit)
        # This is a weird case, that we handled for compiz-fusion:
        # if the dir contains a subdir with the same name, then we stop
        # FIXME: make this an option in the metadata?
        if newdir == os.path.basename(redirect_location):
            break
        if not newdir:
            break

        # Add trailing slash since we work on directories, and some servers
        # don't work if we don't add the trailing slash (like lighttpd on
        # http://download.banshee.fm/banshee-community-extensions/)
        if newdir[-1] != '/':
            newdir += '/'

        # usock.url reflects any redirect the server sent us
        redirect_location = posixjoin(usock.url, newdir)
        location = posixjoin(location, newdir)

    return (location, files)


#######################################################################


def _get_files_from_svn(url):
    """Return (url, files) advertised by an SVN web index (<file href=...>)."""
    obj = urllib.request.build_opener()

    # Get the files
    usock = obj.open(url)
    data = usock.read()
    usock.close()
    # network responses are bytes on Python 3; the parser wants str
    if isinstance(data, bytes):
        data = data.decode('utf-8', 'replace')

    parser = svnurllister()
    parser.feed(data)
    parser.close()
    files = parser.urls

    return (url, files)


#######################################################################


def _setup_limit(limit):
    limit_data = None

    if not limit:
        pass
    elif limit == 'no-odd-unstable':
        pass
    elif limit[:4] == 'max|':
        limit_data = limit[4:]
        limit = 'max'
    elif limit[:5] == 'skip|':
        limit_data = limit[5:]
        limit = 'skip'
    else:
        print('Unsupported limit: %s' % limit, file=sys.stderr)
        limit = None

    return (limit, limit_data)

def _respect_limit(version, limit, limit_data):
    if not limit:
        return True
    elif limit == 'no-odd-unstable':
        # remove the part after dashes. Eg, in 3.3-1, we're only interested in
        # the 3.3 part.
        version = version.split('-')[0]
        split_version = version.split('.')
        if len(split_version) <= 1:
            # not enough version data, so let's just say yes
            return True

        try:
            return int(split_version[1]) % 2 == 0
        except:
            # second element is not an int. Let's just say yes
            return True

    elif limit == 'max':
        return version_gt(limit_data, version)

    elif limit == 'skip':
        skip_versions = [ x for x in limit_data.split(';') if x ]
        return not (version in skip_versions)

    else:
        return False


def _get_latest_version(versions, limit):
    """Return the highest version in versions that respects limit, or None."""
    (limit, limit_data) = _setup_limit(limit)

    # keep only the versions that pass the limit, preserving order
    candidates = [v for v in versions if _respect_limit(v, limit, limit_data)]
    if not candidates or not candidates[0]:
        return None

    biggest = candidates[0]
    for candidate in candidates[1:]:
        if version_gt(candidate, biggest):
            biggest = candidate

    return biggest


#######################################################################


def _all_indexes(list, item, shift = 0):
    try:
        i = list.index(item)
        real_index = i + shift
    except ValueError:
        return []

    subresult = _all_indexes(list[i+1:], item, real_index + 1)
    subresult.append(real_index)
    return subresult


# based on code from convert-to-tarball.py
def _get_version_from_files(modulename, location, files, limit):
    """Find the latest tarball for modulename among the hrefs in files.

    location is the page the hrefs were found on; relative hrefs are
    joined against it.  Returns (tarball URL, version).  Raises
    UpstreamDownloadError when nothing matches the module or the limit.
    """
    def is_of_interest(modulename, file):
        ''' The file is of interest if it contains the module name, and if
            either the basename of the URI matches a tarball for this module,
            or there's a query (like /download.cgi?filename=module-x.y.tar.gz)
        '''
        if not modulename in file:
            return False

        parsed = urllib.parse.urlparse(file)
        if os.path.basename(parsed.path).startswith(modulename):
            return True

        if parsed.query:
            return True

        return False

    # Only include tarballs for the given module
    tarballs = [file for file in files if is_of_interest(modulename, file)]

    # Remove fragment identifiers (anchors)
    tarballs = [tarball.rsplit('#', 1)[0] for tarball in tarballs]

    # raw string keeps the \. and \- escapes valid on modern Python;
    # NOTE: modulename is interpolated unescaped (historical behavior)
    re_tarball = r'^.*' + modulename + r'[_-]v?(([0-9]+[\.\-])*[0-9]+)\.(?:tar.*|t[bg]z2?)$'
    # Don't include -beta -installer -stub-installer and all kinds of
    # other weird-named tarballs
    tarballs = [t for t in tarballs if re.search(re_tarball, t)]

    versions = [re.sub(re_tarball, r'\1', t) for t in tarballs]

    if not versions:
        raise UpstreamDownloadError('No versions found')

    version = _get_latest_version(versions, limit)

    if not version:
        raise UpstreamDownloadError('No version found respecting the limits')

    indexes = _all_indexes(versions, version)
    # the list is not in the right order, because of the way we build the list
    indexes.reverse()

    latest = [tarballs[index] for index in indexes]

    # Prefer the best compression format among the candidate tarballs
    candidates = []
    for ext in ('.tar.xz', '.tar.bz2', '.tar.gz', '.tbz2', '.tgz'):
        candidates = [file for file in latest if file.endswith(ext)]
        if candidates:
            break

    if not candidates:
        raise UpstreamDownloadError('No tarball found for version %s' % version)

    # at this point, all the tarballs we have are relevant, so just take the
    # first one
    tarball = candidates[0]

    if urllib.parse.urlparse(tarball).scheme != '':
        # full URI
        location = tarball
    else:
        # remove files from location when we know it's not a directory
        if len(location) > 5 and location[-5:] in [ '.html' ]:
            last_slash = location.rfind('/')
            if last_slash != -1:
                location = location[:last_slash + 1]
        # add potentially missing slash to the directory
        if location[-1:] != '/':
            location = location + '/'
        location = urllib.parse.urljoin(location, tarball)

    return (location, version)


#######################################################################


def _fix_sf_location(url):
    if url and url.startswith('http://sourceforge.net/projects/') and url.endswith('/download'):
        # We want to move from:
        #    http://sourceforge.net/projects/gtkpod/files%2Fgtkpod%2Fgtkpod-2.0.2%2Fgtkpod-2.0.2.tar.gz/download
        # to:
        #    http://downloads.sourceforge.net/project/gtkpod/gtkpod/gtkpod-2.0.2/gtkpod-2.0.2.tar.gz

        # strip leading 'http://sourceforge.net/projects/' and trailing '/download'
        stripped = url[len('http://sourceforge.net/projects/'):-len('/download')]
        # find project name
        prjname = stripped[:stripped.find('/')]
        # find path to file
        files_index = stripped.find('/files%2F')
        if files_index != -1:
            path = stripped[files_index + len('/files%2F'):]
            path = path.replace('%2F', '/')
        else:
            files_index = stripped.find('/files/')
            path = stripped[files_index + len('/files/'):]

        return 'http://downloads.sourceforge.net/project/%s/%s' % (prjname, path)

    return url

def _get_version_from_sf_rss(modulename, id, limit):
    """Find the latest tarball of modulename via the SourceForge RSS feed.

    id is '<project-id>' or '<project-id>|<path>'.  Returns (location,
    version); location is rewritten to a direct download URL.
    """
    (limit, limit_data) = _setup_limit(limit)

    ids = id.split('|')
    url = 'http://sourceforge.net/api/file/index/project-id/%s/rss' % ids[0]
    if len(ids) > 1:
        # we do not want urlencode since spaces are %20 and not +
        url += '?path=/%s' % urllib.parse.quote(ids[1])

    feed = feedparser.parse(url)

    re_tarball = re.compile(r'^.*(?:/|%2F)'+re.escape(modulename)+r'[_-](([0-9]+[\.\-])*[0-9]+)\.(tar.*|t[bg]z2?)/')

    biggest = '0'
    location = None
    best_ext = None

    for entry in feed['entries']:
        unquoted_link = urllib.parse.unquote(entry.link)
        match = re_tarball.match(unquoted_link)
        if not match:
            continue

        version = match.group(1)
        # group(3) is the extension (group(2) is an inner group of the
        # version); prepend the dot so the preference checks below can
        # compare against '.tar.xz' & co.  The old code read group(2)
        # without a dot, so the format preference never applied.
        ext = '.' + match.group(3)
        if not version_gt(version, biggest):
            continue
        if not _respect_limit(version, limit, limit_data):
            continue

        if biggest == version:
            # same version in several formats: prefer xz over bz2 over gz
            if best_ext in [ '.tar.xz' ]:
                continue
            elif ext in [ '.tar.xz' ]:
                pass
            # this chain must be elif: a new .tar.xz candidate should not
            # be discarded just because the current best is .tar.bz2
            elif best_ext in [ '.tar.bz2', '.tbz2' ]:
                continue
            elif ext in [ '.tar.bz2', '.tbz2' ]:
                pass
            elif best_ext in [ '.tar.gz', '.tgz' ]:
                continue
            elif ext in [ '.tar.gz', '.tgz' ]:
                pass
            else:
                continue

        biggest = version
        location = entry.link
        best_ext = ext

    if biggest == '0' and location == None:
        biggest = None

    location = _fix_sf_location(location)

    return (location, biggest)


#######################################################################


def _fix_sf_jp_location(location, project):
    sf_jp = re.compile('^http://sourceforge.jp/projects/%s/downloads/([^/]+)/([^/]+)$' % project)
    match = sf_jp.match(location)
    if match:
        return 'http://sourceforge.jp/frs/redir.php?m=jaist&f=%%2F%s%%2F%s%%2F%s' % (project, match.group(1), match.group(2))

    return location


def _get_version_from_sf_jp(cache_dir, modulename, project, limit):
    """Find the latest tarball of modulename on sourceforge.jp."""
    url = 'http://sourceforge.jp/projects/%s/releases/' % project
    (location, files) = _get_files_from_http(url, cache_dir)

    # Strip the trailing slash that would break _get_version_from_files().
    # For instance:
    # http://sourceforge.jp/projects/scim-imengine/downloads/29155/scim-canna-1.0.1.tar.gz/
    stripped_files = [f[:-1] for f in files]
    (location, version) = _get_version_from_files(modulename, location, stripped_files, limit)

    return (_fix_sf_jp_location(location, project), version)


#######################################################################


def _get_version_from_google_atom(name, limit):
    """Find the latest tarball of a Google Code project via its Atom feed.

    name is '<project>' or '<project>|<tarball-basename>'.  Raises
    UpstreamDownloadError when no version is found.
    """
    (limit, limit_data) = _setup_limit(limit)

    names = name.split('|')
    project = names[0]
    tarball = names[1] if len(names) > 1 else project

    # See http://code.google.com/p/support/issues/detail?id=2926
    #url = 'http://code.google.com/feeds/p/%s/downloads/basic?%s' % (project, urllib.urlencode({'q': tarball}))
    url = 'http://code.google.com/feeds/p/%s/downloads/basic' % (project, )

    feed = feedparser.parse(url)

    version_re = re.compile('^\s*'+re.escape(tarball)+'[_-]((?:[0-9]+\.)*[0-9]+)\.(tar.*|t[bg]z2?)')
    download_re = re.compile('<a href="([^"]*)">Download</a>')

    biggest = '0'
    location = None

    for entry in feed['entries']:
        title_match = version_re.match(entry.title)
        if not title_match:
            continue

        version = title_match.group(1)
        if not version_gt(version, biggest):
            continue
        if not _respect_limit(version, limit, limit_data):
            continue

        # the entry body contains the actual download link
        link_match = download_re.search(entry.content[0]['value'])
        if link_match:
            download_url = link_match.group(1)
        else:
            download_url = 'http://code.google.com/p/%s/downloads/list' % project

        biggest = version
        location = download_url

    if biggest == '0' and location == None:
        raise UpstreamDownloadError('No versions found')

    return (location, biggest)


#######################################################################


LP_NS = '{https://launchpad.net/rdf/launchpad#}'
RDF_NS = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'

def _get_version_from_launchpad_series(project, limit, limit_data, series):
    """Return (location, biggest version) found in one Launchpad series.

    location is always None for now (see the TODO below).
    """
    url = 'https://launchpad.net/%s/%s/+rdf' % (project, series)
    release_re = re.compile('^/%s/%s/((?:[0-9]+\.)*[0-9]+)/\+rdf$' % (re.escape(project), re.escape(series)))
    biggest = '0'

    fd = urllib.request.urlopen(url)
    root = ET.parse(fd).getroot().find(LP_NS + 'ProductSeries')
    fd.close()

    for node in root.findall(LP_NS + 'release'):
        productrelease = node.find(LP_NS + 'ProductRelease')
        if productrelease is None:
            continue
        resource = productrelease.find(LP_NS + 'specifiedAt').get(RDF_NS + 'resource')
        match = release_re.match(resource)
        if not match:
            continue

        version = match.group(1)
        if _respect_limit(version, limit, limit_data) and version_gt(version, biggest):
            biggest = version

    # TODO: this is blocked by https://bugs.launchpad.net/bugs/268359
    location = None

    return (location, biggest)


def _get_version_from_launchpad(project, limit):
    """Find the latest release of a Launchpad project.

    Walks every version-numbered series advertised in the project's RDF
    description, queries only series that could be newer than the best
    version seen so far, and finally checks the 'trunk' series.  Returns
    (location, version); location is currently always None (see
    _get_version_from_launchpad_series).  Raises UpstreamDownloadError
    when nothing at all is found.
    """
    (limit, limit_data) = _setup_limit(limit)

    url = 'https://launchpad.net/%s/+rdf' % project
    # matches e.g. /<project>/1.2/+rdf and captures the series number
    series_re = re.compile('^/%s/((?:[0-9]+\.)*[0-9]+)/\+rdf$' % re.escape(project))

    fd = urllib.request.urlopen(url)
    root = ET.parse(fd).getroot().find(LP_NS + 'Product')
    fd.close()

    # always finish with trunk
    (location, biggest) = (None, '0')

    for node in root.findall(LP_NS + 'series'):
        product = node.find(LP_NS + 'ProductSeries')
        if product is None:
            continue
        specified = product.find(LP_NS + 'specifiedAt')
        series = specified.get(RDF_NS + 'resource')
        match = series_re.match(series)
        if not match:
            continue
        series_version = match.group(1)

        if not _respect_limit(series_version, limit, limit_data):
            continue

        # skip series that cannot contain anything newer than what we
        # already found
        if version_ge(biggest, series_version):
            continue

        (series_location, series_biggest) = _get_version_from_launchpad_series (project, limit, limit_data, series_version)
        if version_gt(series_biggest, biggest):
            (location, biggest) = (series_location, series_biggest)

    # try trunk, in case it exists
    try:
        (trunk_location, trunk_biggest) = _get_version_from_launchpad_series (project, limit, limit_data, 'trunk')
        if version_gt(trunk_biggest, biggest):
            (location, biggest) = (trunk_location, trunk_biggest)
    except UpstreamDownloadError:
        pass
    except urllib.error.HTTPError as e:
        # a project without a trunk series is fine; anything else is not
        if e.code != 404:
            raise e

    if location is None and biggest == '0':
        raise UpstreamDownloadError('No versions found')

    return (location, biggest)



#######################################################################


class trac_urllister(HTMLParser):
    """Collect [href, link text] pairs for <a> tags whose text contains
    the module name, into self.files.

    Ported from sgmllib.SGMLParser (removed in Python 3) to
    html.parser.HTMLParser; the interface (feed()/close()/.files) is
    unchanged.
    """

    def __init__(self, modulename):
        # HTMLParser.__init__ calls reset(), which initializes the state
        # attributes below; modulename is not needed during reset()
        HTMLParser.__init__(self)
        self.modulename = modulename

    def reset(self):
        HTMLParser.reset(self)
        self.in_a = False
        self.current_url = None
        self.files = []

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        self.in_a = True
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.current_url = href[0]

    def handle_data(self, data):
        data = data.strip()
        if self.in_a and self.modulename in data:
            self.files.append([self.current_url, data])

    def handle_endtag(self, tag):
        if tag == 'a':
            self.in_a = False


def _get_version_from_trac(modulename, url, limit):
    """Find the latest tarball of modulename on a Trac download page.

    This is clearly based on _get_version_from_files(), so read the
    comments there.  Returns (location, version).
    """
    obj = urllib.request.build_opener()

    # Get the files
    usock = obj.open(url)
    data = usock.read()
    usock.close()
    # network responses are bytes on Python 3; the parser wants str
    if isinstance(data, bytes):
        data = data.decode('utf-8', 'replace')

    parser = trac_urllister(modulename)
    parser.feed(data)
    parser.close()
    files = parser.files

    (limit, limit_data) = _setup_limit(limit)
    # raw string keeps the \. and \- escapes valid on modern Python
    re_tarball = r'^.*' + modulename + r'[_-](([0-9]+[\.\-])*[0-9]+)\.(?:tar.*|t[bg]z2?)$'
    # files entries are [href, link text]: match against the link text
    tarballs = [t for t in files if re.search(re_tarball, t[1])]
    versions = [re.sub(re_tarball, r'\1', t[1]) for t in tarballs]
    version = _get_latest_version(versions, limit)

    indexes = _all_indexes(versions, version)
    # the list is not in the right order, because of the way we build the list
    indexes.reverse()

    latest = [tarballs[index] for index in indexes]

    # Prefer the best compression format among the candidate tarballs
    candidates = []
    for ext in ('.tar.xz', '.tar.bz2', '.tar.gz', '.tbz2', '.tgz'):
        candidates = [entry for entry in latest if entry[1].endswith(ext)]
        if candidates:
            break

    if not candidates:
        raise UpstreamDownloadError('No tarball found for version %s' % version)

    # first tarball is fine
    semi_url = candidates[0][0]

    if urllib.parse.urlparse(semi_url).scheme != '':
        # full URI
        location = semi_url
    else:
        location = urllib.parse.urljoin(url, semi_url)

    return (location, version)


#######################################################################


def get_upstream_version(cache_dir, modulename, method, additional_info, limit):
    """Look up the latest upstream version of modulename.

    method selects the lookup backend; additional_info is its argument
    (typically a URL or a project identifier).  Returns (location,
    version): (None, '--') for the 'upstream' method and (None, None)
    for unknown methods.
    """
    # for branches, get the real modulename
    modulename = modulename.split('|')[0]

    known_methods = ('upstream', 'ftpls', 'httpls', 'dualhttpls',
                     'subdirhttpls', 'svnls', 'sf', 'sf_jp', 'google',
                     'lp', 'trac')
    if method not in known_methods:
        print('Unsupported method: %s' % method, file=sys.stderr)
        return (None, None)

    if method == 'upstream':
        return (None, '--')

    if method == 'ftpls':
        (location, files) = _get_files_from_ftp(additional_info)
        return _get_version_from_files(modulename, location, files, limit)

    if method == 'httpls':
        (location, files) = _get_files_from_http(additional_info, cache_dir)
        return _get_version_from_files(modulename, location, files, limit)

    if method == 'dualhttpls':
        # check two locations and keep the best result
        (url1, url2) = additional_info.split('|')
        (location1, files1) = _get_files_from_http(url1, cache_dir)
        (location2, files2) = _get_files_from_http(url2, cache_dir)

        try:
            (location1, version1) = _get_version_from_files(modulename, location1, files1, limit)
        except UpstreamDownloadError:
            (location1, version1) = (None, None)

        try:
            (location2, version2) = _get_version_from_files(modulename, location2, files2, limit)
        except UpstreamDownloadError:
            (location2, version2) = (None, None)

        if version1 and version2:
            if version_ge(version1, version2):
                return (location1, version1)
            return (location2, version2)
        if version1:
            return (location1, version1)
        if version2:
            return (location2, version2)
        raise UpstreamDownloadError('No versions found')

    if method == 'subdirhttpls':
        (location, files) = _get_files_from_subdir_http(additional_info, limit)
        return _get_version_from_files(modulename, location, files, limit)

    if method == 'svnls':
        (location, files) = _get_files_from_svn(additional_info)
        return _get_version_from_files(modulename, location, files, limit)

    if method == 'sf':
        return _get_version_from_sf_rss(modulename, additional_info, limit)

    if method == 'sf_jp':
        return _get_version_from_sf_jp(cache_dir, modulename, additional_info, limit)

    if method == 'google':
        return _get_version_from_google_atom(additional_info, limit)

    if method == 'lp':
        return _get_version_from_launchpad(additional_info, limit)

    if method == 'trac':
        return _get_version_from_trac(modulename, additional_info, limit)


#######################################################################


def parse_limits(limits_file):
    """Parse the upstream limits file into a {module: limit} dict.

    Each non-comment line has the form 'module:limit[:rest]' (only the
    first two fields are used).  Returns an empty dict when the file does
    not exist.
    """
    retval = {}

    # isfile() implies exists(); the old double check was redundant
    if not os.path.isfile(limits_file):
        return retval

    # context manager guarantees the file is closed even on errors
    with open(limits_file) as fp:
        for line in fp:
            if _line_is_comment(line):
                continue

            # strip the trailing newline, then split off the first two fields
            data = line[:-1].split(':', 2)
            retval[data[0]] = data[1]

    return retval


#######################################################################


def parse_data(data_file):
    """Parse the saved upstream data file into {module: (version, location)}.

    Only 'nonfgo' lines are kept; each has the form
    'nonfgo:module:version:location' where empty version/location fields
    become None.  Returns an empty dict when the file does not exist.
    """
    retval = {}

    # isfile() implies exists(); the old double check was redundant
    if not os.path.isfile(data_file):
        return retval

    # context manager guarantees the file is closed even on errors
    with open(data_file) as fp:
        for line in fp:
            if _line_is_comment(line):
                continue

            # maxsplit=3 keeps colons inside the location URL intact
            data = line[:-1].split(':', 3)
            if data[0] != 'nonfgo':
                continue

            version = data[2] if data[2] != '' else None
            location = data[3] if data[3] != '' else None

            retval[data[1]] = (version, location)

    return retval


#######################################################################


def do_task(cache_dir, modulename, method, additional_info, fast_update, limits, fallback_data):
    """Compute (category, version, location) for one module.

    Falls back to previously saved data when the download fails, or skips
    the download entirely in fast_update mode when cached data exists.
    category is 'upstream' for modules tracked without a tarball
    (version '--'), 'nonfgo' otherwise.
    """
    (location, version) = (None, None)

    if fast_update and modulename in fallback_data and fallback_data[modulename][0]:
        # fast update: we don't download data if we have something in cache
        pass
    else:
        limit = limits.get(modulename)

        try:
            (location, version) = get_upstream_version(cache_dir, modulename, method, additional_info, limit)
        # HTTPError is a subclass of URLError, so it must be caught first;
        # the old order made this clause unreachable
        except urllib.error.HTTPError as e:
            print('Error when downloading information about %s: server sent %s' % (modulename, e.code), file=sys.stderr)
        except urllib.error.URLError as e:
            print('Error when downloading information about %s: %s' % (modulename, e), file=sys.stderr)
        except ftplib.all_errors as e:
            print('Error when downloading information about %s: %s' % (modulename, e), file=sys.stderr)
        except socket.timeout as e:
            print('Error when downloading information about %s: %s' % (modulename, e), file=sys.stderr)
        except UpstreamRawDownloadError as e:
            print('Error when downloading information about %s: %s' % (modulename, e.msg), file=sys.stderr)
        except UpstreamDownloadError as e:
            print('No matching tarball found for %s: %s' % (modulename, e.msg), file=sys.stderr)

    if modulename in fallback_data:
        fallback_version = fallback_data[modulename][0]
        fallback_location = fallback_data[modulename][1]

        # fill in whatever we failed to determine, but only when the part
        # we did determine agrees with the saved data
        if not version and not location:
            version = fallback_version
            location = fallback_location
        elif not version and location == fallback_location:
            version = fallback_version
        elif not location and version == fallback_version:
            location = fallback_location

    # '--' marks modules tracked upstream without a tarball
    if version == '--':
        cat = 'upstream'
    else:
        cat = 'nonfgo'

    if location:
        location = _location_fix(location)

    return (cat, version, location)


#######################################################################


def do_cache(cache_dir, url):
    """Download url and store the raw response in the cache directory.

    On failure the error message is recorded in the side '.error' file so
    later lookups can report it without retrying the download.
    """
    (cache, cache_error) = _get_cache_paths_from_url (url, cache_dir)

    error = None

    try:
        obj = urllib.request.build_opener()
        fin = obj.open(url)
        try:
            # the response is bytes on Python 3: write the cache in binary
            # mode (the old 'w' text mode raised a TypeError)
            with open(cache, 'wb') as fout:
                fout.write(fin.read())
        finally:
            fin.close()
    # HTTPError is a subclass of URLError and must be caught first
    except urllib.error.HTTPError as e:
        error = 'server sent %s' % e.code
    except urllib.error.URLError as e:
        error = str(e)
    except socket.timeout as e:
        error = str(e)

    if error:
        # the old code had 'fout.close' without parentheses, so the error
        # file was never explicitly closed; the context manager fixes that
        with open(cache_error, 'w') as fout:
            fout.write(error)


#######################################################################


def debug_thread(s):
    """Print s prefixed with the current thread name, when USE_DEBUG is set."""
    if not USE_DEBUG:
        return

    # threading.current_thread().name exists on every supported Python 3
    # version; the old hasattr() dance for Python < 2.6 was dead code
    name = threading.current_thread().name

    print('%s: %s' % (name, s))


#######################################################################


def thread_cache(task_cache, cache_dir):
    """Worker thread: cache URLs taken from task_cache until it is empty."""
    while True:
        # get_nowait() closes the race where another worker drains the
        # queue between an empty() check and a blocking get() — the old
        # code could hang forever in get()
        try:
            url = task_cache.get_nowait()
        except queue.Empty:
            break

        debug_thread('starting caching %s' % url)

        try:
            do_cache(cache_dir, url)
        except Exception as e:
            print('Exception in worker thread for caching %s: %s' % (url, e), file=sys.stderr)

        task_cache.task_done()


#######################################################################


def thread_upstream(cache_dir, task, result, fast_update, limits, fallback_data):
    """Worker thread: process lookup tasks from task, putting results on result."""
    while True:
        # get_nowait() closes the race where another worker drains the
        # queue between an empty() check and a blocking get() — the old
        # code could hang forever in get()
        try:
            (modulename, method, additional_info) = task.get_nowait()
        except queue.Empty:
            break

        debug_thread('starting %s' % modulename)

        try:
            (cat, version, location) = do_task(cache_dir, modulename, method, additional_info, fast_update, limits, fallback_data)
            result.put((cat, modulename, version, location))
        except Exception as e:
            print('Exception in worker thread for %s: %s' % (modulename, e), file=sys.stderr)

        task.task_done()


#######################################################################


def start_threads(task_cache, cache_dir, task, result, fast_update, limits, fallback_data):
    """Run the caching workers first (when caching is enabled), then the
    upstream lookup workers; blocks until both queues are drained."""
    if cache_dir:
        # First do all the cache tasks
        for _ in range(min(MAX_THREADS, task.qsize())):
            worker = threading.Thread(target=thread_cache, args=(task_cache, cache_dir))
            worker.start()

        task_cache.join()

    # Then do all the remaining tasks
    for _ in range(min(MAX_THREADS, task.qsize())):
        worker = threading.Thread(target=thread_upstream,
                                  args=(cache_dir, task, result, fast_update, limits, fallback_data))
        worker.start()

    task.join()


#######################################################################


def main(args):
    """Entry point: check upstream versions for a list of modules.

    Reads the upstream-tarballs data file (one 'module:method:info' line per
    module), prefetches http indexes into a temporary cache, runs the version
    checks with worker threads, and writes 'category:module:version:location'
    lines to the save file (or stdout in --debug mode).

    Returns 0 on success, 1 on a setup error, 2 when --only-if-old decided
    the existing result file is recent enough.
    """
    parser = optparse.OptionParser()

    parser.add_option('--debug', dest='debug',
                      help='only handle the argument as input and output the result')
    parser.add_option('--log', dest='log',
                      help='log file to use (default: stderr)')
    parser.add_option('--directory', dest='dir', default='.',
                      help='directory where to find data and save data')
    parser.add_option('--save-file', dest='save',
                      help='path to the file where the results will be written')
    parser.add_option('--upstream-limits', dest='upstream_limits',
                      help='path to the upstream limits data file')
    parser.add_option('--upstream-tarballs', dest='upstream_tarballs',
                      help='path to the upstream tarballs data file')
    parser.add_option('--fast-update', action='store_true',
                      default=False, dest='fast_update',
                      help='when available, use old saved data instead of looking for new data (limits will be ignored)')
    parser.add_option('--use-old-as-fallback', action='store_true',
                      default=False, dest='fallback',
                      help='if available, use old saved data as a fallback for when we cannot find new data (limits will be ignored for the fallback case)')
    parser.add_option('--only-if-old', action='store_true',
                      default=False, dest='only_if_old',
                      help='execute only if the pre-existing result file is older than 10 hours')

    (options, args) = parser.parse_args()

    fallback_data = {}

    directory = options.dir

    if options.log:
        # Redirect all diagnostics to the log file; also used by the worker
        # threads, which print their errors to sys.stderr.
        path = os.path.realpath(options.log)
        safe_mkdir_p(os.path.dirname(path))
        sys.stderr = open(options.log, 'a')

    # Empty/None option values fall back to the default path, exactly like
    # the original if/else pairs did.
    limit_file = options.upstream_limits or os.path.join(directory, 'upstream-limits.txt')

    limits = parse_limits(limit_file)

    if options.debug:
        # Debug mode: treat the argument as the single input line and print
        # the result instead of writing a save file.
        lines = [options.debug + '\n']
        out = sys.stdout

    else:
        upstream_file = options.upstream_tarballs or os.path.join(directory, 'upstream-tarballs.txt')
        save_file = options.save or os.path.join(directory, 'versions-upstream')

        if not os.path.exists(upstream_file):
            print('Upstream data file %s does not exist.' % upstream_file, file=sys.stderr)
            return 1
        elif not os.path.isfile(upstream_file):
            print('Upstream data file %s is not a regular file.' % upstream_file, file=sys.stderr)
            return 1

        if os.path.exists(save_file):
            if not os.path.isfile(save_file):
                print('Save file %s is not a regular file.' % save_file, file=sys.stderr)
                return 1
            if options.only_if_old:
                stats = os.stat(save_file)
                # Quit if it's less than 10-hours old
                if time.time() - stats.st_mtime < 3600 * 10:
                    return 2

            if options.fallback or options.fast_update:
                fallback_data = parse_data(save_file)
        else:
            safe_mkdir_p(os.path.dirname(save_file))

        # 'with' guarantees the handle is closed even on error, and avoids
        # shadowing the 'file' builtin like the old code did.
        with open(upstream_file) as upstream_fd:
            lines = upstream_fd.readlines()

        out = open(save_file, 'w')

    # The default timeout is just too long. Use 10 seconds instead.
    socket.setdefaulttimeout(10)

    to_cache = set()
    task_cache = queue.Queue()
    task = queue.Queue()
    result = queue.Queue()

    for line in lines:
        if _line_is_comment(line):
            continue

        # Only split on the first two colons: additional_info may itself
        # contain colons (e.g. in URLs).
        (modulename, method, additional_info) = line[:-1].split(':', 2)

        # We will do a cache prefetch of all http url, so that we don't have to
        # open the same url several times
        if method in [ 'httpls', 'dualhttpls' ]:
            if method == 'httpls':
                to_cache.add(additional_info)
            elif method == 'dualhttpls':
                (url1, url2) = additional_info.split('|')
                to_cache.add(url1)
                to_cache.add(url2)

        task.put((modulename, method, additional_info))

    # We'll have to remove this temporary cache; no cache in debug mode.
    if not options.debug:
        cache_dir = tempfile.mkdtemp(prefix='upstream-cache-', dir=os.path.dirname(save_file))
    else:
        cache_dir = None

    for url in to_cache:
        task_cache.put(url)

    try:
        start_threads(task_cache, cache_dir, task, result, options.fast_update, limits, fallback_data)

        # All workers are done (start_threads joins the queues), so draining
        # the result queue with empty() is safe here.
        while not result.empty():
            (cat, modulename, version, location) = result.get()

            # Always emit 'cat:module:version:location', leaving missing
            # fields empty.
            if version and location:
                out.write('%s:%s:%s:%s\n' % (cat, modulename, version, location))
            elif version:
                out.write('%s:%s:%s:\n' % (cat, modulename, version))
            elif location:
                out.write('%s:%s::%s\n' % (cat, modulename, location))
            else:
                out.write('%s:%s::\n' % (cat, modulename))

            result.task_done()

    finally:
        # Clean up even if a worker or the write loop raised: remove the
        # temporary cache and close the save file (but never sys.stdout).
        if cache_dir:
            shutil.rmtree(cache_dir)

        if not options.debug:
            out.close()

    return 0


if __name__ == '__main__':
    # Exit with main()'s status code; a Ctrl-C just ends the program quietly.
    try:
        sys.exit(main(sys.argv))
    except KeyboardInterrupt:
        pass