Diffstat (limited to 'pyload/utils/packagetools.py')
-rw-r--r-- | pyload/utils/packagetools.py | 155
1 files changed, 155 insertions, 0 deletions
diff --git a/pyload/utils/packagetools.py b/pyload/utils/packagetools.py
new file mode 100644
index 000000000..791a46d51
--- /dev/null
+++ b/pyload/utils/packagetools.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+
+# JDownloader/src/jd/controlling/LinkGrabberPackager.java
+
+import re
+from urlparse import urlparse
+
+def matchFirst(string, *args):
+    """ matches against list of regexp and returns first match"""
+    for patternlist in args:
+        for pattern in patternlist:
+            r = pattern.search(string)
+            if r is not None:
+                name = r.group(1)
+                return name
+
+    return string
+
+
+def parseNames(files):
+    """ Generates packages names from name, data lists
+
+    :param files: list of (name, data)
+    :return: packagenames mapped to data lists (eg. urls)
+    """
+    packs = {}
+
+    endings = "\\.(3gp|7zip|7z|abr|ac3|aiff|aifc|aif|ai|au|avi|bin|bz2|cbr|cbz|ccf|cue|cvd|chm|dta|deb|divx|djvu|dlc|dmg|doc|docx|dot|eps|exe|ff|flv|f4v|gsd|gif|gz|iwd|iso|ipsw|java|jar|jpg|jpeg|jdeatme|load|mws|mw|m4v|m4a|mkv|mp2|mp3|mp4|mov|movie|mpeg|mpe|mpg|msi|msu|msp|nfo|npk|oga|ogg|ogv|otrkey|pkg|png|pdf|pptx|ppt|pps|ppz|pot|psd|qt|rmvb|rm|rar|ram|ra|rev|rnd|r\\d+|rpm|run|rsdf|rtf|sh(!?tml)|srt|snd|sfv|swf|tar|tif|tiff|ts|txt|viv|vivo|vob|wav|wmv|xla|xls|xpi|zeno|zip|z\\d+|_[_a-z]{2}|\\d+$)"
+
+    rarPats = [re.compile("(.*)(\\.|_|-)pa?r?t?\\.?[0-9]+.(rar|exe)$", re.I),
+               re.compile("(.*)(\\.|_|-)part\\.?[0]*[1].(rar|exe)$", re.I),
+               re.compile("(.*)\\.rar$", re.I),
+               re.compile("(.*)\\.r\\d+$", re.I),
+               re.compile("(.*)(\\.|_|-)\\d+$", re.I)]
+
+    zipPats = [re.compile("(.*)\\.zip$", re.I),
+               re.compile("(.*)\\.z\\d+$", re.I),
+               re.compile("(?is).*\\.7z\\.[\\d]+$", re.I),
+               re.compile("(.*)\\.a.$", re.I)]
+
+    ffsjPats = [re.compile("(.*)\\._((_[a-z])|([a-z]{2}))(\\.|$)"),
+                re.compile("(.*)(\\.|_|-)[\\d]+(" + endings + "$)", re.I)]
+
+    iszPats = [re.compile("(.*)\\.isz$", re.I),
+               re.compile("(.*)\\.i\\d{2}$", re.I)]
+
+    pat1 = re.compile("(\\.?CD\\d+)", re.I)
+    pat2 = re.compile("(\\.?part\\d+)", re.I)
+
+    pat3 = re.compile("(.+)[\\.\\-_]+$")
+    pat4 = re.compile("(.+)\\.\\d+\\.xtm$")
+
+    for file, url in files:
+        patternMatch = False
+
+        if file is None:
+            continue
+
+        # remove trailing /
+        name = file.rstrip('/')
+
+        # extract last path part .. if there is a path
+        split = name.rsplit("/", 1)
+        if len(split) > 1:
+            name = split.pop(1)
+
+        #check if an already existing package may be ok for this file
+        #  found = False
+        #  for pack in packs:
+        #      if pack in file:
+        #          packs[pack].append(url)
+        #          found = True
+        #          break
+        #
+        #  if found: continue
+
+        # unrar pattern, 7zip/zip and hjmerge pattern, isz pattern, FFSJ pattern
+        before = name
+        name = matchFirst(name, rarPats, zipPats, iszPats, ffsjPats)
+        if before != name:
+            patternMatch = True
+
+        # xtremsplit pattern
+        r = pat4.search(name)
+        if r is not None:
+            name = r.group(1)
+
+        # remove part and cd pattern
+        r = pat1.search(name)
+        if r is not None:
+            name = name.replace(r.group(0), "")
+            patternMatch = True
+
+        r = pat2.search(name)
+        if r is not None:
+            name = name.replace(r.group(0), "")
+            patternMatch = True
+
+        # additional checks if extension pattern matched
+        if patternMatch:
+            # remove extension
+            index = name.rfind(".")
+            if index <= 0:
+                index = name.rfind("_")
+            if index > 0:
+                length = len(name) - index
+                if length <= 4:
+                    name = name[:-length]
+
+            # remove endings like . _ -
+            r = pat3.search(name)
+            if r is not None:
+                name = r.group(1)
+
+            # replace . and _ with space
+            name = name.replace(".", " ")
+            name = name.replace("_", " ")
+
+            name = name.strip()
+        else:
+            name = ""
+
+        # fallback: package by hoster
+        if not name:
+            name = urlparse(file).hostname
+            if name: name = name.replace("www.", "")
+
+        # fallback : default name
+        if not name:
+            name = "unknown"
+
+        # build mapping
+        if name in packs:
+            packs[name].append(url)
+        else:
+            packs[name] = [url]
+
+    return packs
+
+
+if __name__ == "__main__":
+    from os.path import join
+    from pprint import pprint
+
+    f = open(join("..", "..", "testlinks2.txt"), "rb")
+    urls = [(x.strip(), x.strip()) for x in f.readlines() if x.strip()]
+    f.close()
+
+    print "Having %d urls." % len(urls)
+
+    packs = parseNames(urls)
+
+    pprint(packs)
+
+    print "Got %d urls." % sum([len(x) for x in packs.itervalues()])
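For illustration only, a minimal usage sketch of parseNames follows. It is not part of the commit; it assumes the new module is importable as pyload.utils.packagetools under Python 2 (matching the urlparse import and print statements above), and the example links are made up.

# Minimal sketch, not part of the commit: hypothetical links fed to parseNames.
from pyload.utils.packagetools import parseNames

links = [
    ("http://example.com/dl/Some.Movie.2010.part1.rar", "http://example.com/dl/Some.Movie.2010.part1.rar"),
    ("http://example.com/dl/Some.Movie.2010.part2.rar", "http://example.com/dl/Some.Movie.2010.part2.rar"),
    ("http://example.com/dl/readme.txt", "http://example.com/dl/readme.txt"),
]

packs = parseNames(links)
# The two .partN.rar links collapse into one package named "Some Movie 2010";
# readme.txt matches no split-archive pattern and falls back to a package
# named after the hoster, "example.com".
for pack, urls in packs.iteritems():
    print "%s: %d link(s)" % (pack, len(urls))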