summary | refs | log | tree | commit | diff | stats
path: root/module/common
diff options
context:
space:
mode:
Diffstat (limited to 'module/common')
-rw-r--r-- module/common/packagetools.py 132
1 files changed, 66 insertions, 66 deletions
diff --git a/module/common/packagetools.py b/module/common/packagetools.py
index d930157e1..5bfbcba95 100644
--- a/module/common/packagetools.py
+++ b/module/common/packagetools.py
@@ -1,57 +1,17 @@
-# -*- coding: utf-8 -*-
+#!/usr/bin/env python
-import re
+# JDownloader/src/jd/controlling/LinkGrabberPackager.java
+import re
from urlparse import urlparse
-
-endings = ("jdeatme", "3gp", "7zip", "7z", "abr", "ac3", "aiff", "aifc", "aif", "ai",
- "au", "avi", "apk", "bin", "bmp", "bat", "bz2", "cbr", "cbz", "ccf", "chm",
- "cr2", "cso", "cue", "cvd", "dta", "deb", "divx", "djvu", "dlc", "dmg", "doc",
- "docx", "dot", "eps", "epub", "exe", "ff", "flv", "flac", "f4v", "gsd", "gif",
- "gpg", "gz", "iwd", "idx", "iso", "ipa", "ipsw", "java", "jar", "jpe?g", "load",
- "m2ts", "m4v", "m4a", "md5", "mkv", "mp2", "mp3", "mp4", "mobi", "mov", "movie",
- "mpeg", "mpe", "mpg", "mpq", "msi", "msu", "msp", "mv", "mws", "nfo", "npk", "oga",
- "ogg", "ogv", "otrkey", "par2", "pkg", "png", "pdf", "pptx?", "ppsx?", "ppz", "pot",
- "psd", "qt", "rmvb", "rm", "rar", "ram", "ra", "rev", "rnd", "rpm", "run", "rsdf",
- "reg", "rtf", "shnf", "sh(?!tml)", "ssa", "smi", "sub", "srt", "snd", "sfv", "sfx",
- "swf", "swc", "tar\.(gz|bz2|xz)", "tar", "tgz", "tiff?", "ts", "txt", "viv", "vivo",
- "vob", "vtt", "webm", "wav", "wmv", "wma", "xla", "xls", "xpi", "zeno", "zip",
- "[r-z]\d{2}", "_[_a-z]{2}", "\d{3,4}(?=\?|$|\"|\r|\n)")
-
-rarPats = [re.compile(r'(.*)(\.|_|-)pa?r?t?\.?\d+.(rar|exe)$', re.I),
- re.compile(r'(.*)(\.|_|-)part\.?[0]*[1].(rar|exe)$', re.I),
- re.compile(r'(.*)\.rar$', re.I),
- re.compile(r'(.*)\.r\d+$', re.I),
- re.compile(r'(.*)(\.|_|-)\d+$', re.I)]
-
-zipPats = [re.compile(r'(.*)\.zip$', re.I),
- re.compile(r'(.*)\.z\d+$', re.I),
- re.compile(r'(?is).*\.7z\.[\d]+$', re.I),
- re.compile(r'(.*)\.a.$', re.I)]
-
-ffsjPats = [re.compile(r'(.*)\._((_[a-z])|([a-z]{2}))(\.|$)'),
- re.compile(r'(.*)(\.|_|-)[\d]+(\.(' + '|'.join(endings) + ')$)', re.I)]
-
-iszPats = [re.compile(r'(.*)\.isz$', re.I),
- re.compile(r'(.*)\.i\d{2}$', re.I)]
-
-pat0 = re.compile(r'www\d*\.', re.I)
-
-pat1 = re.compile(r'(\.?CD\d+)', re.I)
-pat2 = re.compile(r'(\.?part\d+)', re.I)
-
-pat3 = re.compile(r'(.+)[\.\-_]+$')
-pat4 = re.compile(r'(.+)\.\d+\.xtm$')
-
-
def matchFirst(string, *args):
- """ matches against list of regexp and returns first match """
+ """ matches against list of regexp and returns first match"""
for patternlist in args:
for pattern in patternlist:
- m = pattern.search(string)
- if m is not None:
- name = m.group(1)
+ r = pattern.search(string)
+ if r is not None:
+ name = r.group(1)
return name
return string
@@ -61,10 +21,35 @@ def parseNames(files):
""" Generates packages names from name, data lists
:param files: list of (name, data)
- :return: packagenames mapped to data lists (eg. urls)
+ :return: packagenames mapt to data lists (eg. urls)
"""
packs = {}
+ endings = "\\.(3gp|7zip|7z|abr|ac3|aiff|aifc|aif|ai|au|avi|bin|bz2|cbr|cbz|ccf|cue|cvd|chm|dta|deb|divx|djvu|dlc|dmg|doc|docx|dot|eps|exe|ff|flv|f4v|gsd|gif|gz|iwd|iso|ipsw|java|jar|jpg|jpeg|jdeatme|load|mws|mw|m4v|m4a|mkv|mp2|mp3|mp4|mov|movie|mpeg|mpe|mpg|msi|msu|msp|nfo|npk|oga|ogg|ogv|otrkey|pkg|png|pdf|pptx|ppt|pps|ppz|pot|psd|qt|rmvb|rm|rar|ram|ra|rev|rnd|r\\d+|rpm|run|rsdf|rtf|sh(!?tml)|srt|snd|sfv|swf|tar|tif|tiff|ts|txt|viv|vivo|vob|wav|wmv|xla|xls|xpi|zeno|zip|z\\d+|_[_a-z]{2}|\\d+$)"
+
+ rarPats = [re.compile("(.*)(\\.|_|-)pa?r?t?\\.?[0-9]+.(rar|exe)$", re.I),
+ re.compile("(.*)(\\.|_|-)part\\.?[0]*[1].(rar|exe)$", re.I),
+ re.compile("(.*)\\.rar$", re.I),
+ re.compile("(.*)\\.r\\d+$", re.I),
+ re.compile("(.*)(\\.|_|-)\\d+$", re.I)]
+
+ zipPats = [re.compile("(.*)\\.zip$", re.I),
+ re.compile("(.*)\\.z\\d+$", re.I),
+ re.compile("(?is).*\\.7z\\.[\\d]+$", re.I),
+ re.compile("(.*)\\.a.$", re.I)]
+
+ ffsjPats = [re.compile("(.*)\\._((_[a-z])|([a-z]{2}))(\\.|$)"),
+ re.compile("(.*)(\\.|_|-)[\\d]+(" + endings + "$)", re.I)]
+
+ iszPats = [re.compile("(.*)\\.isz$", re.I),
+ re.compile("(.*)\\.i\\d{2}$", re.I)]
+
+ pat1 = re.compile("(\\.?CD\\d+)", re.I)
+ pat2 = re.compile("(\\.?part\\d+)", re.I)
+
+ pat3 = re.compile("(.+)[\\.\\-_]+$")
+ pat4 = re.compile("(.+)\\.\\d+\\.xtm$")
+
for file, url in files:
patternMatch = False
@@ -79,7 +64,7 @@ def parseNames(files):
if len(split) > 1:
name = split.pop(1)
- #check if an already existing package may be ok for this file
+ #check if a already existing package may be ok for this file
# found = False
# for pack in packs:
# if pack in file:
@@ -87,8 +72,7 @@ def parseNames(files):
# found = True
# break
#
- # if found:
- # continue
+ # if found: continue
# unrar pattern, 7zip/zip and hjmerge pattern, isz pattern, FFSJ pattern
before = name
@@ -97,19 +81,19 @@ def parseNames(files):
patternMatch = True
# xtremsplit pattern
- m = pat4.search(name)
- if m is not None:
- name = m.group(1)
+ r = pat4.search(name)
+ if r is not None:
+ name = r.group(1)
# remove part and cd pattern
- m = pat1.search(name)
- if m is not None:
- name = name.replace(m.group(0), "")
+ r = pat1.search(name)
+ if r is not None:
+ name = name.replace(r.group(0), "")
patternMatch = True
- m = pat2.search(name)
- if m is not None:
- name = name.replace(m.group(0), "")
+ r = pat2.search(name)
+ if r is not None:
+ name = name.replace(r.group(0), "")
patternMatch = True
# additional checks if extension pattern matched
@@ -124,9 +108,9 @@ def parseNames(files):
name = name[:-length]
# remove endings like . _ -
- m = pat3.search(name)
- if m is not None:
- name = m.group(1)
+ r = pat3.search(name)
+ if r is not None:
+ name = r.group(1)
# replace . and _ with space
name = name.replace(".", " ")
@@ -139,12 +123,11 @@ def parseNames(files):
# fallback: package by hoster
if not name:
name = urlparse(file).hostname
- if name:
- name = pat0.sub("", name)
+ if name: name = name.replace("www.", "")
# fallback : default name
if not name:
- name = _("Unnamed package")
+ name = "unknown"
# build mapping
if name in packs:
@@ -153,3 +136,20 @@ def parseNames(files):
packs[name] = [url]
return packs
+
+
+if __name__ == "__main__":
+ from os.path import join
+ from pprint import pprint
+
+ f = open(join("..", "..", "testlinks2.txt"), "rb")
+ urls = [(x.strip(), x.strip()) for x in f.readlines() if x.strip()]
+ f.close()
+
+ print "Having %d urls." % len(urls)
+
+ packs = parseNames(urls)
+
+ pprint(packs)
+
+ print "Got %d urls." % sum([len(x) for x in packs.itervalues()])