try:
    set
except NameError:
    # Very old interpreters have no builtin set type; fall back to sets.Set.
    from sets import Set as set

import new
from re import compile
from inspect import isfunction, getargspec

from zope.interface import implements, directlyProvides, classProvides
from twisted.plugin import IPlugin

from dosage.util import fetchUrl, fetchManyUrls, getQueryParams
from dosage.comic import Comic
from dosage.scraper import IScraper


class BasicComicModule(object):
    '''Base class with functions for comic modules.

    Subclasses are expected to supply C{getLatestUrl()}; it is called here
    but not defined on this class.

    @type latestUrl: C{string}
    @cvar latestUrl: The URL for the latest comic strip.
    @type imageUrl: C{string}
    @cvar imageUrl: A string that is interpolated with the strip index
                    to yield the URL for a particular strip.
    @type imageSearch: C{regex}
    @cvar imageSearch: A compiled regex that will locate the strip image URL
                       when applied to the strip page.
    @type prevSearch: C{regex}
    @cvar prevSearch: A compiled regex that will locate the URL for the
                      previous strip when applied to a strip page.
    '''
    referrer = None
    help = 'Sorry, no help for this module yet.'

    def __init__(self):
        """Initialise the module name and the page-traversal state."""
        self.name = self.__class__.__name__.split('.')[-1]
        # URL of the strip page that getNextComics() will fetch next.
        self.currentUrl = None
        # Strip pages already fetched; used to stop when a page repeats.
        self.urls = set()

    def getFilename(self, imageUrl, pageUrl):
        """Return the filename to use for a strip image.

        This base implementation always returns C{None}.
        """
        return None

    def getReferrer(self, imageUrl, pageUrl):
        """Return the referrer URL to associate with an image.

        The first truthy value of the class-level C{referrer}, C{pageUrl}
        and C{self.getLatestUrl()} is used.
        """
        return self.referrer or pageUrl or self.getLatestUrl()

    def getComic(self, url, pageUrl):
        """Build a L{Comic} for the image at C{url} found on C{pageUrl}.

        @return: A L{Comic}, or C{None} when C{url} is empty.
        """
        if not url:
            return None
        return Comic(self.name, url,
                     filename=self.getFilename(url, pageUrl),
                     referrer=self.getReferrer(url, pageUrl))

    def getCurrentComics(self):
        """Reset to the latest strip page and return its comics."""
        self.currentUrl = self.getLatestUrl()
        return self.getNextComics()

    def getNextComics(self):
        """Fetch strip pages, walking backward, until one yields comics.

        Starting at C{self.currentUrl}, each page is searched with
        C{imageSearch} and C{prevSearch}. The loop ends as soon as a page
        produced at least one entry, when there is no current URL, or when
        the current URL has already been visited.

        @return: A list with the result of L{getComic} for every image URL
                 found on the last fetched page (possibly empty).
        """
        comics = []
        while not comics and self.currentUrl and self.currentUrl not in self.urls:
            comicUrlGroups, prevUrl = fetchManyUrls(
                self.currentUrl, (self.imageSearch, self.prevSearch))
            # Only the first "previous" match is used; an empty result or
            # an empty first match both mean there is no previous page.
            if prevUrl:
                prevUrl = prevUrl[0] or None
            else:
                prevUrl = None

            for comicUrl in comicUrlGroups:
                comics.append(self.getComic(comicUrl, self.currentUrl))

            self.urls.add(self.currentUrl)
            # Never step onto a page that was already visited.
            if prevUrl in self.urls:
                self.currentUrl = None
            else:
                self.currentUrl = prevUrl
        return comics

    def setStrip(self, index):
        """Point the traversal at the strip identified by C{index}.

        C{index} is interpolated into C{self.imageUrl}.
        """
        self.currentUrl = self.imageUrl % index

    def getHelp(self):
        """Return the help text for this module."""
        return self.help

    def __iter__(self):
        """Iterate through the strips, starting from the current one and going backward."""
        if not self.currentUrl:
            self.currentUrl = self.getLatestUrl()

        comics = True
        while comics:
            comics = self.getNextComics()
            if comics:
                yield comics


def bind(fn, self):
    """Return C{fn} bound to C{self} as an instance method of its class."""
    return new.instancemethod(fn, self, self.__class__)


class Scraper(type):
    """Metaclass for scraper classes.

    On class creation it fills in a default C{name} attribute, turns every
    plain function whose first argument is called C{cls} into a classmethod,
    and marks the new class as providing L{IPlugin} and L{IScraper} unless
    the class body sets C{abstract} to a true value.
    """

    def __new__(cls, name, bases, dict):
        dict.setdefault('name', name)
        # 'abstract' is consumed here so it is not inherited by subclasses.
        abstract = dict.pop('abstract', False)

        # Iterate over a snapshot because entries are replaced in the loop.
        for attr, value in list(dict.items()):
            if isfunction(value):
                args = getargspec(value)[0]
                if args[:1] == ['cls']:
                    dict[attr] = classmethod(value)

        newcls = super(Scraper, cls).__new__(cls, name, bases, dict)
        if not abstract:
            directlyProvides(newcls, IPlugin, IScraper)
        return newcls

    def make(self, name, **kw):
        """Create a subclass of this class named C{name}.

        The keyword arguments become the class body of the new subclass.
        """
        return type(self)(name, (self,), kw)


class BasicScraper(BasicComicModule):
    """
    I provide a basic webcomic scraper skeleton.

    C{starter} and C{namer} take C{cls} as their first argument and are
    therefore converted to classmethods by the L{Scraper} metaclass.
    """
    __metaclass__ = Scraper
    abstract = True
    help = 'Sorry, no help for this module yet.'

    def starter(cls):
        """Return the URL of the latest strip page."""
        return cls.latestUrl

    def namer(cls, imageUrl, pageUrl):
        """Return the filename for an image; C{None} by default."""
        return None

    def getFilename(self, imageUrl, pageUrl):
        """Delegate filename selection to C{namer}."""
        return self.namer(imageUrl, pageUrl)

    def getLatestUrl(self):
        """Delegate latest-URL lookup to C{starter}."""
        return self.starter()


def queryNamer(paramName, usePageUrl=False):
    """Build a C{namer} that uses a URL query parameter as the filename.

    @param paramName: Name of the query parameter to read.
    @param usePageUrl: If true, read the parameter from the page URL
                       instead of the image URL.
    @return: A staticmethod suitable for assignment to C{namer}.
    """
    @staticmethod
    def _namer(imageUrl, pageUrl):
        if usePageUrl:
            url = pageUrl
        else:
            url = imageUrl
        return getQueryParams(url)[paramName][0]
    return _namer


def regexNamer(regex):
    """Build a C{namer} that returns group 1 of C{regex} on the image URL.

    @return: A staticmethod suitable for assignment to C{namer}.
    """
    @staticmethod
    def _namer(imageUrl, pageUrl):
        return regex.search(imageUrl).group(1)
    return _namer


def constStarter(latestUrl):
    """Build a C{starter} that always returns C{latestUrl}."""
    @staticmethod
    def _starter():
        return latestUrl
    return _starter


def bounceStarter(latestUrl, nextSearch):
    """Build a C{starter} that steps back one page and then forward again.

    The page at C{latestUrl} is searched with the class's C{prevSearch};
    the resulting page is then searched with C{nextSearch} and that match
    is returned.
    """
    @classmethod
    def _starter(cls):
        url = fetchUrl(latestUrl, cls.prevSearch)
        url = fetchUrl(url, nextSearch)
        return url
    return _starter


def indirectStarter(baseUrl, latestSearch):
    """Build a C{starter} that searches C{baseUrl} with C{latestSearch}."""
    @staticmethod
    def _starter():
        return fetchUrl(baseUrl, latestSearch)
    return _starter


class IndirectLatestMixin(object):
    '''
    Mixin for comics modules that link to the latest comic from a base page of
    some kind. This also supports comics which don't link to the last comic
    from the base page, but the beginning of the latest chapter or similar
    schemes. It simulates going forward until it can't find a 'next' link as
    specified by the 'nextSearch' regex.

    @type baseUrl: C{string}
    @cvar baseUrl: the URL where the link to the latest comic is found.
    @type latestSearch: C{regex}
    @cvar latestSearch: a compiled regex for finding the 'latest' URL.
    @type nextSearch: C{regex}
    @cvar nextSearch: a compiled regex for finding the 'next' URL.
    '''

    # Cached result of getLatestUrl(); filled in on first use.
    __latestUrl = None

    def getLatestUrl(self):
        """Return the URL of the latest strip page, computing it once.

        The base page is searched with C{latestSearch}. If the class defines
        C{nextSearch}, 'next' links are followed from there until none is
        found; the last page reached is cached and returned.
        """
        if not self.__latestUrl:
            self.__latestUrl = fetchUrl(self.baseUrl, self.latestSearch)
            if hasattr(self, "nextSearch"):
                # Remember visited pages so that a 'next' link pointing back
                # to an earlier page cannot make this loop run forever.
                visited = set([self.__latestUrl])
                nextUrl = fetchUrl(self.__latestUrl, self.nextSearch)
                while nextUrl and nextUrl not in visited:
                    visited.add(nextUrl)
                    self.__latestUrl = nextUrl
                    nextUrl = fetchUrl(self.__latestUrl, self.nextSearch)
        return self.__latestUrl

    latestUrl = property(getLatestUrl)


class PHPScraper(BasicScraper):
    """
    I implement IScraper for comics using phpComic/CUSP.

    This provides an easy way to define scrapers for webcomics using phpComic.
    Subclasses provide C{basePath} and C{latestUrl}.
    """
    abstract = True
    imageUrl = property(lambda self: self.basePath + 'daily.php?date=%s')
    # NOTE(review): the pattern literal below starts with '[^]+', which the
    # re module appears to treat as an unterminated character class, so
    # compile() is expected to fail. The text looks truncated; restore the
    # intended pattern from the upstream source before relying on this.
    imageSearch = property(lambda self: compile(
        r'[^]+ src="(%scomics/\d{6}\..+?)">' % (self.basePath,)))

    help = 'Index format: yymmdd'

    def starter(cls):
        """Return the latest strip URL: C{basePath} plus C{latestUrl}."""
        return cls.basePath + cls.latestUrl