summaryrefslogtreecommitdiffstats
path: root/pyload/utils/packagetools.py
diff options
context:
space:
mode:
Diffstat (limited to 'pyload/utils/packagetools.py')
-rw-r--r--pyload/utils/packagetools.py155
1 files changed, 155 insertions, 0 deletions
diff --git a/pyload/utils/packagetools.py b/pyload/utils/packagetools.py
new file mode 100644
index 000000000..791a46d51
--- /dev/null
+++ b/pyload/utils/packagetools.py
@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+
+# JDownloader/src/jd/controlling/LinkGrabberPackager.java
+
+import re
+from urlparse import urlparse
+
+def matchFirst(string, *args):
+ """ matches against list of regexp and returns first match"""
+ for patternlist in args:
+ for pattern in patternlist:
+ r = pattern.search(string)
+ if r is not None:
+ name = r.group(1)
+ return name
+
+ return string
+
+
+def parseNames(files):
+ """ Generates packages names from name, data lists
+
+ :param files: list of (name, data)
+ :return: packagenames mapped to data lists (eg. urls)
+ """
+ packs = {}
+
+ endings = "\\.(3gp|7zip|7z|abr|ac3|aiff|aifc|aif|ai|au|avi|bin|bz2|cbr|cbz|ccf|cue|cvd|chm|dta|deb|divx|djvu|dlc|dmg|doc|docx|dot|eps|exe|ff|flv|f4v|gsd|gif|gz|iwd|iso|ipsw|java|jar|jpg|jpeg|jdeatme|load|mws|mw|m4v|m4a|mkv|mp2|mp3|mp4|mov|movie|mpeg|mpe|mpg|msi|msu|msp|nfo|npk|oga|ogg|ogv|otrkey|pkg|png|pdf|pptx|ppt|pps|ppz|pot|psd|qt|rmvb|rm|rar|ram|ra|rev|rnd|r\\d+|rpm|run|rsdf|rtf|sh(!?tml)|srt|snd|sfv|swf|tar|tif|tiff|ts|txt|viv|vivo|vob|wav|wmv|xla|xls|xpi|zeno|zip|z\\d+|_[_a-z]{2}|\\d+$)"
+
+ rarPats = [re.compile("(.*)(\\.|_|-)pa?r?t?\\.?[0-9]+.(rar|exe)$", re.I),
+ re.compile("(.*)(\\.|_|-)part\\.?[0]*[1].(rar|exe)$", re.I),
+ re.compile("(.*)\\.rar$", re.I),
+ re.compile("(.*)\\.r\\d+$", re.I),
+ re.compile("(.*)(\\.|_|-)\\d+$", re.I)]
+
+ zipPats = [re.compile("(.*)\\.zip$", re.I),
+ re.compile("(.*)\\.z\\d+$", re.I),
+ re.compile("(?is).*\\.7z\\.[\\d]+$", re.I),
+ re.compile("(.*)\\.a.$", re.I)]
+
+ ffsjPats = [re.compile("(.*)\\._((_[a-z])|([a-z]{2}))(\\.|$)"),
+ re.compile("(.*)(\\.|_|-)[\\d]+(" + endings + "$)", re.I)]
+
+ iszPats = [re.compile("(.*)\\.isz$", re.I),
+ re.compile("(.*)\\.i\\d{2}$", re.I)]
+
+ pat1 = re.compile("(\\.?CD\\d+)", re.I)
+ pat2 = re.compile("(\\.?part\\d+)", re.I)
+
+ pat3 = re.compile("(.+)[\\.\\-_]+$")
+ pat4 = re.compile("(.+)\\.\\d+\\.xtm$")
+
+ for file, url in files:
+ patternMatch = False
+
+ if file is None:
+ continue
+
+ # remove trailing /
+ name = file.rstrip('/')
+
+ # extract last path part .. if there is a path
+ split = name.rsplit("/", 1)
+ if len(split) > 1:
+ name = split.pop(1)
+
+ #check if an already existing package may be ok for this file
+ # found = False
+ # for pack in packs:
+ # if pack in file:
+ # packs[pack].append(url)
+ # found = True
+ # break
+ #
+ # if found: continue
+
+ # unrar pattern, 7zip/zip and hjmerge pattern, isz pattern, FFSJ pattern
+ before = name
+ name = matchFirst(name, rarPats, zipPats, iszPats, ffsjPats)
+ if before != name:
+ patternMatch = True
+
+ # xtremsplit pattern
+ r = pat4.search(name)
+ if r is not None:
+ name = r.group(1)
+
+ # remove part and cd pattern
+ r = pat1.search(name)
+ if r is not None:
+ name = name.replace(r.group(0), "")
+ patternMatch = True
+
+ r = pat2.search(name)
+ if r is not None:
+ name = name.replace(r.group(0), "")
+ patternMatch = True
+
+ # additional checks if extension pattern matched
+ if patternMatch:
+ # remove extension
+ index = name.rfind(".")
+ if index <= 0:
+ index = name.rfind("_")
+ if index > 0:
+ length = len(name) - index
+ if length <= 4:
+ name = name[:-length]
+
+ # remove endings like . _ -
+ r = pat3.search(name)
+ if r is not None:
+ name = r.group(1)
+
+ # replace . and _ with space
+ name = name.replace(".", " ")
+ name = name.replace("_", " ")
+
+ name = name.strip()
+ else:
+ name = ""
+
+ # fallback: package by hoster
+ if not name:
+ name = urlparse(file).hostname
+ if name: name = name.replace("www.", "")
+
+ # fallback : default name
+ if not name:
+ name = "unknown"
+
+ # build mapping
+ if name in packs:
+ packs[name].append(url)
+ else:
+ packs[name] = [url]
+
+ return packs
+
+
+if __name__ == "__main__":
+ from os.path import join
+ from pprint import pprint
+
+ f = open(join("..", "..", "testlinks2.txt"), "rb")
+ urls = [(x.strip(), x.strip()) for x in f.readlines() if x.strip()]
+ f.close()
+
+ print "Having %d urls." % len(urls)
+
+ packs = parseNames(urls)
+
+ pprint(packs)
+
+ print "Got %d urls." % sum([len(x) for x in packs.itervalues()])