
Ec        
   @   s,  d  Z  d d l Z e j d k  rB d GHd e j GHe j d  n  d d l Z d d l Z d d l Z d d l Z d d l	 Z	 d d l
 Z
 d d l Z d d l Z d d l Z d d l Z d d l Z d d l Z y e Z [ Wn e k
 r d Z d Z n Xd Z d	 Z d
 Z d d d d d d d d d d g
 Z d d d g Z d Z d Z e
 j d  Z e e
 j d d d d d d d  g  Z  d! d" d# d$ d% d& d' g Z! d( Z" d) Z# d* Z$ d+ Z% d, Z& d- Z' d. Z( d/ d0 d1 i  d2 d3 f g Z) d4 e* f d5     YZ+ d6 e+ f d7     YZ, d8 f  d9     YZ- e-   Z. d: f  d;     YZ/ e/   Z0 d< e1 f d=     YZ2 d> f  d?     YZ3 d@ f  dA     YZ4 dB f  dC     YZ5 dD f  dE     YZ6 dF f  dG     YZ7 dH e j8 j9 j: f dI     YZ; dJ f  dK     YZ< dL f  dM     YZ= dN e j8 j9 j: f dO     YZ> dP   Z? dQ   Z@ dR   ZA dS   ZB dT   ZC dU   ZD eE dV k r(eD e jF d  ZG eG peG jH dW  peG jH dX  re0 jI e  d  nt eG jH dY  ZJ eC eG dW eJ  ZK eK re0 jI dZ d  n8 eK jL   e0 jI d[ e0 jM d  e0 jI d\ e0 jN d  n  d S(]   sW  A simple script to automatically produce sitemaps for a webserver,
in the Google Sitemap Protocol (GSP).

Usage: python sitemap_gen.py --config=config.xml [--help] [--testing]
            --config=config.xml, specifies config file location
            --help, displays usage message
            --testing, specified when user is experimenting
iNi  s)   This script requires Python 2.2 or later.s   Currently run with version: %si   i    t   ASCIIs   UTF-8t   IDNAs   US-ASCIIt   USt   IBM367t   CP367s   ISO646-USISO_646.IRV:1991s   ISO-IR-6s   ANSI_X3.4-1968s   ANSI_X3.4-1986t   CPASCIIs
   ISO-8859-1s
   ISO-8859-2s
   ISO-8859-5iP  s
   _index.xmls5   .+\s+"([^\s]+)\s+([^\s]+)\s+HTTP/\d+\.\d+"\s+200\s+.*s
   ^\d\d\d\d$s   ^\d\d\d\d-\d\d$s   ^\d\d\d\d-\d\d-\d\d$s   ^\d\d\d\d-\d\d-\d\dT\d\d:\d\dZ$s+   ^\d\d\d\d-\d\d-\d\dT\d\d:\d\d[+-]\d\d:\d\d$s,   ^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?Z$s8   ^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?[+-]\d\d:\d\d$t   alwayst   hourlyt   dailyt   weeklyt   monthlyt   yearlyt   nevers4  <?xml version="1.0" encoding="UTF-8"?>
<sitemapindex
  xmlns="http://www.google.com/schemas/sitemap/0.84"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84
                      http://www.google.com/schemas/sitemap/0.84/siteindex.xsd">
s   </sitemapindex>
sM    <sitemap>
  <loc>%(loc)s</loc>
  <lastmod>%(lastmod)s</lastmod>
 </sitemap>
s,  <?xml version="1.0" encoding="UTF-8"?>
<urlset
  xmlns="http://www.google.com/schemas/sitemap/0.84"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://www.google.com/schemas/sitemap/0.84
                      http://www.google.com/schemas/sitemap/0.84/sitemap.xsd">
s
   </urlset>
s    <url>
s    </url>
t   https   www.google.coms   webmasters/sitemaps/pingt    t   sitemapt   Errorc           B   s   e  Z d  Z RS(   s   
  Base exception class.  In this module we tend not to use our own exception
  types for very much, but they come in very handy on XML parsing with SAX.
  (   t   __name__t
   __module__t   __doc__(    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR      s   t   SchemaErrorc           B   s   e  Z d  Z RS(   s?   Failure to process an XML file according to the schema we know.(   R   R   R   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR      s   t   Encoderc           B   s;   e  Z d  Z d   Z d   Z d   Z d   Z d   Z RS(   s  
  Manages wide-character/narrow-character conversions for just about all
  text that flows into or out of the script.

  You should always use this class for string coercion, as opposed to
  letting Python handle coercions automatically.  Reason: Python
  usually assumes ASCII (7-bit) as a default narrow character encoding,
  which is not the kind of data we generally deal with.

  General high-level methodologies used in sitemap_gen:

  [PATHS]
  File system paths may be wide or narrow, depending on platform.
  This works fine, just be aware of it and be very careful to not
  mix them.  That is, if you have to pass several file path arguments
  into a library call, make sure they are all narrow or all wide.
  This class has MaybeNarrowPath() which should be called on every
  file system path you deal with.

  [URLS]
  URL locations are stored in Narrow form, already escaped.  This has the
  benefit of keeping escaping and encoding as close as possible to the format
  we read them in.  The downside is we may end up with URLs that have
  intermingled encodings -- the root path may be encoded in one way
  while the filename is encoded in another.  This is obviously wrong, but
  it should hopefully be an issue hit by very few users.  The workaround
  from the user level (assuming they notice) is to specify a default_encoding
  parameter in their config file.

  [OTHER]
  Other text, such as attributes of the URL class, configuration options,
  etc, are generally stored in Unicode for simplicity.
  c         C   s  d  |  _ g  |  _ t |  _ y t j j |  _ WnA t k
 rq y t	 j
   t j k |  _ Wqr t k
 rm qr Xn Xy7 t	 j   } | r | j   t k r | g |  _ n  Wn t k
 r n X|  j s t	 j   } | r | j   t k r | g |  _ q n  |  j st |  _ n  d  S(   N(   t   Nonet   _usert   _learnedt   Falset
   _widefilest   ost   patht   supports_unicode_filenamest   AttributeErrort   syst   getwindowsversiont   VER_PLATFORM_WIN32_NTt   getfilesystemencodingt   uppert   ENC_ASCII_LISTt   getdefaultencodingt   ENC_DEFAULT_LIST(   t   selft   encoding(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   __init__   s,    					c         C   s   | |  _  d  S(   N(   R   (   R'   R(   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   SetUserEncoding   s    c         C   sh  t  |  t j k r | S| r y9 | j |  } | |  j k rS |  j j |  n  | SWq t k
 rk q t k
 r t j	 d |  q Xn  |  j
 r y | j |  j
  SWq t k
 r q t k
 r |  j
 } d |  _
 t j	 d |  q Xn  x9 |  j r2y | j |  j d  SWq |  j d =q Xq Wy | j t  SWn t k
 rWn X| j t d  S(   s"    Narrow a piece of arbitrary text s   Unknown encoding: %ss   Unknown default_encoding: %si    t   ignoreN(   t   typet   typest   UnicodeTypet   encodeR   t   appendt   UnicodeErrort   LookupErrort   outputt   WarnR   R   t   ENC_UTF8t	   ENC_ASCII(   R'   t   textR(   t   resultt   temp(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt
   NarrowText   s>    			c         C   s   |  j  r | S|  j | d  S(   s#    Paths may be allowed to stay wide N(   R   R:   R   (   R'   R7   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   MaybeNarrowPath'  s    	c         C   s  t  |  t j k r | S| r y9 t | |  } | |  j k rS |  j j |  n  | SWq t k
 rk q t k
 r t j	 d |  q Xn  |  j
 r y t | |  j
  SWq t k
 r q t k
 r |  j
 } d |  _
 t j	 d |  q Xn  x9 |  j r2y t | |  j d  SWq |  j d =q Xq Wy t | t  SWn t k
 rWn Xt j	 d |  |  j
 st j	 d  n  | j t d  S(   s!    Widen a piece of arbitrary text s   Unknown encoding: %ss   Unknown default_encoding: %si    s!   Unrecognized encoding in text: %ssB   You may need to set a default_encoding in your configuration file.R+   N(   R,   R-   t
   StringTypet   unicodeR   R0   R1   R2   R3   R4   R   R   R5   t   decodeR6   (   R'   R7   R(   R8   R9   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt	   WidenText.  sD    				(   R   R   R   R)   R*   R:   R;   R?   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR      s   !	 		-	t   Outputc           B   sD   e  Z d  Z d   Z d   Z d   Z d   Z d   Z d   Z RS(   s  
  Exposes logging functionality, and tracks how many errors
  we have thus output.

  Logging levels should be used as thus:
    Fatal     -- extremely sparingly
    Error     -- config errors, entire blocks of user 'intention' lost
    Warn      -- individual URLs lost
    Log(,0)   -- Un-suppressable text that's not an error
    Log(,1)   -- touched files, major actions
    Log(,2)   -- parsing notes, filtered or duplicated URLs
    Log(,3)   -- each accepted URL
  c         C   s1   d |  _  d |  _ i  |  _ i  |  _ d |  _ d  S(   Ni    (   t
   num_errorst	   num_warnst   _errors_shownt   _warns_shownt   _verbose(   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)   r  s
    				c         C   s6   | r2 t  j | d  } |  j | k r2 | GHq2 n  d S(   sC    Output a blurb of diagnostic text, if the verbose level allows it N(   t   encoderR:   R   RE   (   R'   R7   t   level(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   Log{  s    c         C   s   | r t  j | d  } t j |  j   } |  j j |  sX d |  j | <d | GHn |  j d | d  |  j	 d |  _	 n  d S(   s;    Output and count a warning.  Suppress duplicate warnings. i   s
   [WARNING] s   (suppressed) [WARNING] i   N(
   RF   R:   R   t   md5t   newt   digestRD   t   has_keyRH   RB   (   R'   R7   t   hash(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR4     s    c         C   s   | r t  j | d  } t j |  j   } |  j j |  sX d |  j | <d | GHn |  j d | d  |  j	 d |  _	 n  d S(   s8    Output and count an error.  Suppress duplicate errors. i   s   [ERROR] s   (suppressed) [ERROR] i   N(
   RF   R:   R   RI   RJ   RK   RC   RL   RH   RA   (   R'   R7   RM   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR     s    c         C   s:   | r$ t  j | d  } d | GHn d GHt j d  d S(   s,    Output an error and terminate the program. s   [FATAL] s   Fatal error.i   N(   RF   R:   R   R   t   exit(   R'   R7   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   Fatal  s
    c         C   sv   yM t  |  t j k r' t |  } n  | d k rL | d k rL | |  _ d SWn t k
 r` n X|  j d |  d S(   s    Sets the verbose level. i    i   Ns5   Verbose level (%s) must be between 0 and 3 inclusive.(   R,   R-   t   IntTypet   intRE   t
   ValueErrorR   (   R'   RG   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt
   SetVerbose  s    	(	   R   R   R   R)   RH   R4   R   RO   RS   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR@   c  s   						
t   URLc           B   s   e  Z d  Z d Z d   Z d   Z d   Z d   Z e e  Z d	   Z	 e e	  Z	 d
   Z
 d   Z d d d  Z d   Z RS(   sg    URL is a smart structure grouping together the properties we
  care about for a single web reference. t   loct   lastmodt
   changefreqt   priorityc         C   s(   d  |  _ d  |  _ d  |  _ d  |  _ d  S(   N(   R   RU   RV   RW   RX   (   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     s    			c         C   s0   |  j  | j  k  r d S|  j  | j  k r, d Sd S(   Nii   i    (   RU   (   R'   t   other(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   __cmp__  s
    c         C   s^   | d k r! |  j  |  |  _ n9 y t |  | |  Wn" t k
 rY t j d |  n Xd S(   sT    Attempt to set the attribute to the value, with a pretty try
    block around it.  RU   s   Unknown URL attribute: %sN(   t   CanonicalizeRU   t   setattrR   R3   R4   (   R'   t	   attributet   value(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   TrySetAttribute  s    c         C   sP   |  s
 t  St j |  d  } t j |  \ } } } } } | sH | rL t  St S(   s&    Decide if the URL is absolute or not N(   R   RF   R:   R   t   urlparset   urlsplitt   True(   RU   t   narrowt   schemet   netlocR   t   queryt   frag(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt
   IsAbsolute  s    c         C   s0  |  s
 |  St  j |  d  } t j |  \ } } } } } d } d } t j | | | d  } t j | | | d  } t j | | | d  } t j | | | d  } yr t j |   \ }	 }
 }	 }	 }	 xM |
 D]E } | t d  k r |
 j t	  } t j | | | d  } Pq q WWn. t
 k
 r3n t k
 rPt j d  n Xt } d | k rlt } n  t j | | | | | f  } d	 } | j d  } | d
 } | d
 =x_ | D]W } t |  d k r | d
 | k r | d | k r | d | } q| d | } qW| r,t j d |  n  | S(   s2    Do encoding and canonicalization on a URL string s   -._~s   !$&'()*+,;=s   %:@/[]s   %:@/s   %:@/?i   s   An International Domain Name (IDN) is being used, but this version of Python does not have support for IDNA encoding.  (IDNA support was introduced in Python 2.3)  The encoding we have used instead is wrong and will probably not yield valid URLs.t   %t   0123456789abcdefABCDEFi    i   i   s   %25s;   Invalid characters in the host or domain portion of a URL: N(   RF   R:   R   R`   Ra   t   urllibt   quotet   unichrR/   t   ENC_IDNAR1   R2   R3   R4   R   Rb   t
   urlunsplitt   splitt   len(   RU   Rc   Rd   Re   R   Rf   Rg   t   unrt   subR+   t
   widenetloct   ct
   bad_netloct   HEXDIGt   listt   item(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR[     sJ    	
2	c         C   s  t  |  t j k s t  |  j s5 t j d  t S| rV t j	 | |  j  |  _ n  |  j j
 |  s t j d |  j  d	 |  _ t S|  j r	t } |  j j   |  _ x* t D]" } | j |  j  } | r Pq q W| s	t j d |  j |  j f  d	 |  _ q	n  |  j rt } |  j j   |  _ x' t D] } |  j | k r1t } Pq1q1W| st j d |  j |  j f  d	 |  _ qn  |  j r d } y t |  j  } Wn t k
 rn X| d k  s| d k r t j d |  j |  j f  d	 |  _ q n  t S(
   sB    Verify the data in this URL is well-formed, and override if not. s	   Empty URLs4   Discarded URL for not starting with the base_url: %ss?   Lastmod "%s" does not appear to be in ISO8601 format on URL: %ss;   Changefreq "%s" is not a valid change frequency on URL : %sg      g        g      ?sB   Priority "%s" is not a number between 0 and 1 inclusive on URL: %sN(   R,   R-   R<   t   AssertionErrorRU   R3   R4   R   R`   t   urljoint
   startswithR   RV   R#   t   LASTMOD_PATTERNSt   matchRW   t   lowert   CHANGEFREQ_PATTERNSRb   RX   t   floatRR   (   R'   t   base_urlt   allow_fragmentR~   t   patternRX   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   Validate!  sX    									c         C   sO   |  j  s d S|  j  j d  r9 t j |  j  d   j   St j |  j   j   S(   s(    Provides a uniform way of hashing URLs t   /iN(   RU   R   t   endswithRI   RJ   RK   (   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   MakeHashZ  s
    	RT   i   c         C   st   | d } xC |  j  D]8 } t |  |  } | s8 d } n  | d | | f } q Wt j d t j | d  |  d S(   s.    Dump the contents, empty or not, to the log. t   :R   s	     %s=[%s]s   %sN(   t	   __slots__t   getattrR3   RH   RF   R:   R   (   R'   t   prefixRG   t   outR]   R^   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyRH   c  s    
	c         C   s   |  j  s d St } x |  j D] } t |  |  } | r t |  t j k rb t j | d  } n$ t |  t j
 k r t |  } n  t j j j |  } | d | | | f } q q W| t } | j |  d S(   s<    Dump non-empty contents to the output file, in XML format. Ns     <%s>%s</%s>
(   RU   t   SITEURL_XML_PREFIXR   R   R,   R-   R.   RF   R:   R   R<   t   strt   xmlt   saxt   saxutilst   escapet   SITEURL_XML_SUFFIXt   write(   R'   t   fileR   R]   R^   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   WriteXMLp  s    	
(   s   locs   lastmods
   changefreqs   priority(   R   R   R   R   R)   RZ   R_   Rh   t   staticmethodR[   R   R   RH   R   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyRT     s   				
	<	9		t   Filterc           B   s    e  Z d  Z d   Z d   Z RS(   sQ  
  A filter on the stream of URLs we find.  A filter is, in essence,
  a wildcard applied to the stream.  You can think of this as an
  operator that returns a tri-state when given a URL:

    True  -- this URL is to be included in the sitemap
    None  -- this URL is undecided
    False -- this URL is to be dropped from the sitemap
  c         C   s  d  |  _ d  |  _ t |  _ t d | d  s1 d  St j } | j d  } | j d d  } | j d d  } | r | j	   } n  | r | j	   } n  | s t j
 d  nW | s | d k r | d k r t j
 d	  n( | d
 k r| d k rt j
 d  n  | d k rt |  _ n | d
 k r4t |  _ n  | d k rL| |  _ nM | d k ry t j |  |  _ Wqt j k
 rt j
 d |  qXn  | t j k rt j d | | | f d  n  d  S(   Nt   FILTERR   R,   t   actiont   wildcardt   drops1   On a filter you must specify a "pattern" to matcht   regexpsH   On a filter you must specify either 'type="wildcard"' or 'type="regexp"'t   passsT   If you specify a filter action, it must be either 'action="pass"' or 'action="drop"'s   Bad regular expression: %ss'   Filter: %s any URL that matches %s "%s"i   (   s   patterns   types   action(   R   t	   _wildcardt   _regexpR   t   _passt   ValidateAttributesR3   RA   t   getR   R   Rb   t   ret   compilet   errorRH   (   R'   t
   attributesRA   R   R,   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     sD    							c         C   sz   | s | j  r d S|  j rA t j | j  |  j  r= |  j Sd S|  j rj |  j j | j   rf |  j Sd St sv t	  d S(   s    Process the URL, as above. N(
   RU   R   R   t   fnmatcht   fnmatchcaseR   R   t   searchR   Rz   (   R'   t   url(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   Apply  s    		(   R   R   R   R)   R   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR     s   		3t   InputURLc           B   s    e  Z d  Z d   Z d   Z RS(   s   
  Each Input class knows how to yield a set of URLs from a data source.

  This one handles a single URL, manually specified in the config file.
  c         C   s   d  |  _ t d | d
  s d  St   } xK | j   D]= } | d k r^ | j d | |  q5 | j | | |  q5 W| j s t j d  d  S| |  _ t j	 d |  j j d	  d  S(   NRT   t   hrefRV   RW   RX   RU   s(   Url entries must have an href attribute.s   Input: From URL "%s"i   (   s   hrefs   lastmods
   changefreqs   priority(
   R   t   _urlR   RT   t   keysR_   RU   R3   R   RH   (   R'   R   R   t   attr(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     s    						c         C   s    |  j  r | |  j  t  n  d S(   sD    Produces URLs from our data source, hands them in to the consumer. N(   R   Rb   (   R'   t   consumer(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   ProduceURLs  s    	(   R   R   R   R)   R   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR     s   	t   InputURLListc           B   s    e  Z d  Z d   Z d   Z RS(   s   
  Each Input class knows how to yield a set of URLs from a data source.

  This one handles a text file with a list of URLs
  c         C   s   d  |  _ d  |  _ t d | d  s( d  S| j d  |  _ | j d t  |  _ |  j r t j |  j  |  _ t j	 j
 |  j  r t j d |  j d  q t j d |  j  d  |  _ n t j d  d  S(	   Nt   URLLISTR   R(   s   Input: From URLLIST "%s"i   s   Can not locate file: %ss-   Urllist entries must have a "path" attribute.(   s   paths   encoding(   R   t   _patht	   _encodingR   R   R5   RF   R;   R   R   t   isfileR3   RH   R   (   R'   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     s    			c         C   s  t  |  j d  \ } } | s" d Sd } xa| j   D]S} | d } |  j rf t j | |  j  } n  | j   } | s5 | d d k r q5 n  t   } | j d  } x1 t	 d t
 |   D] } | | j   | | <q W| j d | d  x t	 d t
 |   D]r } | | ry0 | | j d d  \ }	 }
 | j |	 |
  Wqwt k
 rst j d	 | | | f  qwXqqW| | t  q5 W| j   | r| j   n  d S(
   sD    Produces URLs from our data source, hands them in to the consumer. R   Ni    i   t   #t    RU   t   =s&   Line %d: Unable to parse attribute: %s(   t   OpenFileForReadR   t	   readlinesR   RF   R?   t   stripRT   Rp   t   rangeRq   R_   RR   R3   R4   R   t   close(   R'   R   t   frameR   t   linenumt   lineR   t   colst   it	   attr_namet   attr_val(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR     s8    
		
	
(   R   R   R   R)   R   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR     s   	t   InputDirectoryc           B   s    e  Z d  Z d   Z d   Z RS(   s   
  Each Input class knows how to yield a set of URLs from a data source.

  This one handles a directory that acts as base for walking the filesystem.
  c         C   s  d  |  _ d  |  _ d  |  _ t d | d  s1 d  S| j d  } | sW t j d  d  St j	 |  } | j
 t j  s | t j } n  t j j |  s t j d |  d  S| j d  } | s t j d  d  St j |  } | j
 d  s | d } n  | j |  sNt j | |  } | j |  sNt j d | | f  d  Sn  | j d  } | rt j	 |  } t j | k rt j d	 |  d  } qn  | |  _ | |  _ | |  _ | rt j d
 | | | f d  n t j d | | f d  d  S(   Nt	   DIRECTORYR   R   t   default_files<   Directory entries must have both "path" and "url" attributess   Can not locate directory: %sR   s:   The directory URL "%s" is not relative to the base_url: %ss7   The default_file "%s" can not include path information.s6   Input: From DIRECTORY "%s" (%s) with default file "%s"i   s4   Input: From DIRECTORY "%s" (%s) with no default file(   s   paths   urls   default_file(   R   R   R   t   _default_fileR   R   R3   R   RF   R;   R   R   t   sepR   t   isdirRT   R[   R|   R`   R{   RH   (   R'   R   R   R   R   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)   N  sX    												c            s   |  j  s d S|  j   |  j   |  j       f d      f d   } t j d |  j  d   |  j  d  t j j |  j  | d  d S(   sD    Produces URLs from our data source, hands them in to the consumer. Nc            s  t    } t } y | r0 t j j |  |  } n |  } t j j |  } d } | r  r t j j |   } y t j |  t j } Wq t	 k
 r q Xn  | s t j |  t j } n  t
 |  | _ Wn! t	 k
 r n t k
 r n X|  t   } t j d k r*| j t j d  } n  | r=| d } n  | rc| | } | rc| d } qcn  | j d   t j | d   | r | k r| j d d d d  d S | t  d S(   sn   
      Called once per file.
      Note that 'name' will occasionally be None -- for a directory itself
      R   RU   R   s   IGNORED (default file)RG   i   N(   RT   R   R   R   t   joinR   R   t   statt   ST_MTIMEt   OSErrort   TimestampISO8601RV   RR   Rq   R   t   replaceR_   RF   R?   RH   (   t   dirpatht   nameR   R   R   t   timeR   t   middle(   t   root_URLt	   root_filet	   root_pathR   (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   PerFile  sF    	
 c            sL   |  }  | j     s* t j d |  d Sx | D] }  | |  q1 Wd S(   sT   
      Called once per directory with a list of all the contained files/dirs.
      s8   Unable to decide what the root path is for directory: %sN(   R|   R3   R4   (   R+   R   t   namelistR   (   R   R   (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   PerDirectory  s    	s   Walking DIRECTORY "%s"i   (	   R   R   R   R3   RH   R   R   R   t   walk(   R'   R   R   (    (   R   R   R   R   R   s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR     s    				1(   R   R   R   R)   R   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR   G  s   	:t   InputAccessLogc           B   sD   e  Z d  Z d   Z d   Z d   Z d   Z d   Z d   Z RS(   s-  
  Each Input class knows how to yield a set of URLs from a data source.

  This one handles access logs.  It's non-trivial in that we want to
  auto-detect log files in the Common Logfile Format (as used by Apache,
  for instance) and the Extended Log File Format (as used by IIS, for
  instance).
  c         C   s  d  |  _ d  |  _ t |  _ t |  _ d |  _ d |  _ d |  _ d |  _	 d |  _
 t d | d	  sg d  S| j d  |  _ | j d t  |  _ |  j r t j |  j  |  _ t j j |  j  r t j d |  j d  qt j d |  j  d  |  _ n t j d  d  S(
   Nit	   ACCESSLOGR   R(   s   Input: From ACCESSLOG "%s"i   s   Can not locate file: %ss/   Accesslog entries must have a "path" attribute.(   s   paths   encoding(   R   R   R   R   t   _is_elft   _is_clft   _elf_statust   _elf_methodt   _elf_urit   _elf_urifrag1t   _elf_urifrag2R   R   R5   RF   R;   R   R   R   R3   RH   R   (   R'   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     s(    										c         C   s   | j  d  s t S| j d  } | d =x t d t |   D] } | | j   } | d k rm | |  _ q? | d k r | |  _ q? | d k r | |  _ q? | d k r | |  _	 q? | d k r? | |  _
 q? q? Wt j d	 d
  t S(   s7    Recognize the Fields directive that heads an ELF file s   #Fields:R   i    s	   sc-statuss	   cs-methods   cs-uris   cs-uri-stems   cs-uri-querys,   Recognized an Extended Log File Format file.i   (   R|   R   Rp   R   Rq   R   R   R   R   R   R   R3   RH   Rb   (   R'   R   t   fieldsR   t   field(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   RecognizeELFLine  s$    c         C   s  | j  d  } t |  } |  j d k r] |  j | k r= d S| |  j j   d k s] d Sn  |  j d k r |  j | k r d S| |  j j   d	 k r d Sn  |  j d k r |  j | k r d S| |  j j   } | d k r | Sn  |  j d k r|  j | k s|  j | k rd S| |  j j   } d } |  j d k rP| |  j } n  | r| d k r| r| d k r| d | } n  | Sn  d S(
   s*    Fetch the requested URL from an ELF line R   i    t   200t   HEADt   GETt   -t   ?N(   R   R   (	   Rp   Rq   R   R   R   R   R   R   R   (   R'   R   R   t   countR   t   urlfrag1t   urlfrag2(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt
   GetELFLine  s<    c         C   sG   t  j |  } | o' | j d  d k } | rC t j d d  n  | S(   sR    Try to tokenize a logfile line according to CLF pattern and see if
    it works. i   R   R   s(   Recognized a Common Logfile Format file.i   (   s   HEADs   GET(   t   ACCESSLOG_CLF_PATTERNR~   t   groupR3   RH   (   R'   R   R~   t	   recognize(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   RecognizeCLFLine>  s
    c         C   sD   t  j |  } | r@ | j d  } | d k r@ | j d  Sn  d S(   s)    Fetch the requested URL from a CLF line i   R   R   i   (   s   HEADs   GETN(   R   R~   R   R   (   R'   R   R~   t   request(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt
   GetCLFLineH  s    c         C   s0  t  |  j d  \ } } | s" d Sx | j   D] } |  j rV t j | |  j  } n  | j   } |  j r |  j r |  j	 |  |  _ |  j
 |  |  _ n  d } |  j r |  j |  } n |  j r |  j |  } n  | s q/ n  t   } | j d |  | | t  q/ W| j   | r,| j   n  d S(   sD    Produces URLs from our data source, hands them in to the consumer. R   NRU   (   R   R   R   R   RF   R?   R   R   R   R   R   R   R   R   RT   R_   Rb   R   (   R'   R   R   R   R   R~   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR   R  s.    				
(	   R   R   R   R)   R   R   R   R   R   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR     s   			+	
	
t   InputSitemapc           B   s   e  Z d  Z d e f d     YZ d e f d     YZ d e f d     YZ d e f d     YZ d	 e f d
     YZ d e f d     YZ	 d   Z
 d   Z d   Z d   Z d   Z d   Z d   Z RS(   s  
  Each Input class knows how to yield a set of URLs from a data source.

  This one handles Sitemap files and Sitemap index files.  For the sake
  of simplicity in design (and simplicity in interfacing with the SAX
  package), we do not handle these at the same time, recursively.  Instead
  we read an index file completely and make a list of Sitemap files, then
  go back and process each Sitemap.
  t   _ContextBasec           B   sD   e  Z d  Z d   Z d   Z d   Z d   Z d   Z d   Z RS(   st  Base class for context handlers in our SAX processing.  A context
    handler is a class that is responsible for understanding one level of
    depth in the XML schema.  The class knows what sub-tags are allowed,
    and doing any processing specific for the tag we're in.

    This base class is the API filled in by specific context handlers,
    all defined below.
    c         C   s   | |  _  d |  _ d S(   sU   Initialize with a sequence of the sub-tags that would be valid in
      this context.N(   t   _allowed_tagsR   t	   _last_tag(   R'   t   subtags(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     s    	c         C   s.   | |  j  k } | r! | |  _ n	 d |  _ | S(   s<   Returns True iff opening a sub-tag is valid in this context.N(   R   R   R   (   R'   t   tagt   valid(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt	   AcceptTag  s
    	c         C   s   t  S(   s:   Returns True iff a blurb of text is valid in this context.(   R   (   R'   R7   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt
   AcceptText  s    c         C   s   d S(   s+   The context is opening.  Do initialization.N(    (   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   Open  s    c         C   s   d S(   s3   The context is closing.  Return our result, if any.N(    (   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   Close  s    c         C   s   | r t   n  d S(   s
  We're returning to this context after handling a sub-tag.  This
      method is called with the result data from the sub-tag that just
      closed.  Here in _ContextBase, if we ever see a result it means
      the derived child class forgot to override this method.N(   t   NotImplementedError(   R'   R8   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   Return  s    (	   R   R   R   R)   R  R  R  R  R  (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR     s   			
			t   _ContextUrlSetc           B   s   e  Z d  Z d   Z RS(   s3   Context handler for the document node in a Sitemap.c         C   s   t  j j |  d  d  S(   NR   (   s   url(   R   R   R)   (   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     s    (   R   R   R   R)   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s   t   _ContextUrlc           B   s2   e  Z d  Z d   Z d   Z d   Z d   Z RS(   s,   Context handler for a URL node in a Sitemap.c         C   s,   t  j j |  t j  d |  _ | |  _ d S(   sU   Initialize this context handler with the callable consumer that
      wants our URLs.N(   R   R   R)   RT   R   R   R   t	   _consumer(   R'   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     s    	c         C   s    |  j  s t  t   |  _  d S(   s   Initialize the URL.N(   R   Rz   RT   (   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s    c         C   s/   |  j  s t  |  j |  j  t  d |  _  d S(   s2   Pass the URL to the consumer and reset it to None.N(   R   Rz   R
  R   R   (   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s    c         C   s2   |  j  s t  | r. |  j  j |  j |  n  d S(   s7   A value context has closed, absorb the data it gave us.N(   R   Rz   R_   R   (   R'   R8   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s    (   R   R   R   R)   R  R  R  (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR	    s
   			t   _ContextSitemapIndexc           B   s2   e  Z d  Z d   Z d   Z d   Z d   Z RS(   s7   Context handler for the document node in an index file.c         C   s    t  j j |  d  g  |  _ d  S(   NR   (   s   sitemap(   R   R   R)   t   _loclist(   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     s    c         C   s   |  j  s t  d S(   s   Just a quick verify of state.N(   R  Rz   (   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s    c         C   s#   |  j  r |  j  } g  |  _  | Sd S(   s$   Return our list of accumulated URLs.N(   R  (   R'   R9   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s    			c         C   s   | r |  j  j |  n  d S(   s0   Getting a new loc URL, add it to the collection.N(   R  R0   (   R'   R8   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s    (   R   R   R   R)   R  R  R  (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s
   			t   _ContextSitemapc           B   s2   e  Z d  Z d   Z d   Z d   Z d   Z RS(   s5   Context handler for a Sitemap entry in an index file.c         C   s    t  j j |  d  d  |  _ d  S(   NRU   RV   (   s   locs   lastmod(   R   R   R)   R   t   _loc(   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     s    c         C   s   |  j  s t  d S(   s   Just a quick verify of state.N(   R  Rz   (   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s    c         C   s0   |  j  r |  j  } d |  _  | St j d  d S(   s   Return our URL to our parent.s:   In the Sitemap index file, a "sitemap" entry had no "loc".N(   R  R   R3   R4   (   R'   R9   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s
    			c         C   s%   | r! |  j  d k r! | |  _ n  d S(   s2   A value has closed.  If it was a 'loc', absorb it.RU   N(   R   R  (   R'   R8   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s    (   R   R   R   R)   R  R  R  (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s
   				t   _ContextValuec           B   s2   e  Z d  Z d   Z d   Z d   Z d   Z RS(   s   Context handler for a single value.  We return just the value.  The
    higher level context has to remember what tag led into us.c         C   s    t  j j |  d  d  |  _ d  S(   N(    (   R   R   R)   R   t   _text(   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)   %  s    c         C   s)   |  j  r |  j  | |  _  n	 | |  _  t S(   s(   Allow all text, adding it to our buffer.(   R  Rb   (   R'   R7   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR  *  s    		c         C   s   d |  _ d S(   s   Initialize our buffer.N(   R   R  (   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR  3  s    c         C   s+   |  j  } d |  _  | r' | j   } n  | S(   s   Return what's in our buffer.N(   R  R   R   (   R'   R7   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR  8  s
    		(   R   R   R   R)   R  R  R  (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR     s
   				c         C   s   t  j j j j |   d |  _ d |  _ d |  _ d |  _	 d |  _
 t d | d g  s\ d S| j d  } | r t j |  } t j j |  r t j d | d  | g |  _ q t j d |  n t j d  d S(	   sQ   Initialize with a dictionary of attributes from our entry in the
    config file.it   SITEMAPR   Ns   Input: From SITEMAP "%s"i   s   Can not locate file "%s"s-   Sitemap entries must have a "path" attribute.(   R   R   t   handlert   ContentHandlerR)   R   t	   _pathlistt   _currentt	   _contextst   _contexts_idxt   _contexts_stmR   R   RF   R;   R   R   R   R3   RH   R   (   R'   R   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)   B  s     					c         C   s   t  j   t  j   t  j   g |  _ t  j   t  j |  t  j   g |  _ |  j sZ t	  |  j d } d |  _ |  j |  |  j |  _ x" |  j d D] } |  j |  q Wd S(   s  In general: Produces URLs from our data source, hand them to the
    callable consumer.

    In specific: Iterate over our list of paths and delegate the actual
    processing to helper methods.  This is a complexity no other data source
    needs to suffer.  We are unique in that we can have files that tell us
    to bring in other files.

    Note the decision to allow an index file or not is made in this method.
    If we call our parser with (self._contexts == None) the parser will
    grab whichever context stack can handle the file.  IE: index is allowed.
    If instead we set (self._contexts = ...) before parsing, the parser
    will only use the stack we specify.  IE: index not allowed.
    i    i   N(   R   R  R  R  R  R  R	  R  R  Rz   R   R  t   _ProcessFile(   R'   R   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR   \  s    				c         C   s   | s t   t | d  \ } } | s+ d Sy  d |  _ t j j | |   Wn t k
 ro t j d |  nf t	 k
 r t j d |  nE t j j
 j k
 r } t j d | | j | j | j   f  n X| j   | r | j   n  d S(   sB   Do per-file reading/parsing/consuming for the file path passed in.R  Nis8   An error in file "%s" made us abort reading the Sitemap.s   Cannot read from file "%s"s3   XML error in the file "%s" (line %d, column %d): %s(   Rz   R   R  R   R   t   parseR   R3   R   t   IOErrort   _exceptionst   SAXParseExceptiont   _linenumt   _colnumt
   getMessageR   (   R'   R   R   R   t   e(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s$    			$
c         C   s:  |  j  s t  |  j  d } t j j |  } t j j |  } t } t |  t j	 k rd t
 } n  x | D] } t j |  } t j d | d  t j |  \ } } } } }	 t j j |  }
 t j |
  }
 | r t j |
  }
 n  | r| t j |
 }
 n  |
 rk |  j  j |
  t j d |
 d  qk qk Wd S(   sq  Given a list of URLs, munge them into our self._pathlist property.
    We do this by assuming all the files live in the same directory as
    the first file in the existing pathlist.  That is, we assume a
    Sitemap index points to Sitemaps only in the same directory.  This
    is not true in general, but will be true for any output produced
    by this script.
    i    s#   Index points to Sitemap file at: %si   s%   Will attempt to read Sitemap file: %si   N(   R  Rz   R   R   t   normpatht   dirnameR   R,   R-   R.   Rb   RT   R[   R3   RH   R`   Ra   t   basenameRk   t   unquoteRF   R?   R   R0   (   R'   t   urllistR   t   dirt   wideR   Rd   Re   Rf   Rg   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   _MungeLocationListIntoFiles  s(    	c         C   s  |  j  d k  r |  j r |  j r+ |  j s1 t  | d k rL |  j |  _ q | d k rw |  j |  _ t j d d  q t j d  t  n  |  j  d k  r |  j |  j k r | d k r t j d  t  n  | r{d } xy | j	   D]k } |  j  d k  r7| j
 d	  d k rq n  | j
 d
  d k r7q q7n  | rJ| d } n  | | } q W| r{t j d | | f  q{n  |  j  d k  s|  j |  j  j |  r|  j  d |  _  |  j  t |  j  k  st  |  j |  j  j   n t j d |  t  d S(   s   SAX processing, called per node in the config stream.
    As long as the new tag is legal in our current context, this
    becomes an Open call on one context deeper.
    i    t   urlsett   sitemapindexs   File is a Sitemap index.i   sA   The document appears to be neither a Sitemap nor a Sitemap index.s7   A Sitemap index can not refer to another Sitemap index.R   t   xmlnst   xsis   , sM   Did not expect any attributes on any tag, instead tag "%s" had attributes: %si   s)   Can not accept tag "%s" where it appears.N(   R  R  R  R  Rz   R3   RH   R   R   R   t   findR4   R  Rq   R  (   R'   R   R   R7   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   startElement  sB    	!			(c         C   s   | } |  j  d k s t  |  j |  j  j   } |  j  d |  _  |  j  d k rj |  j |  j  j |  n( | r |  j |  j k r |  j |  n  d S(   s   SAX processing, called per node in the config stream.
    This becomes a call to Close on one context followed by a call
    to Return on the previous.
    i    i   N(   R  Rz   R  R  R  R  R)  (   R'   R   t   retval(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt
   endElement  s    c         C   sV   |  j  d k  s) |  j |  j  j |  rR | j   rR t j d |  t  qR n  d S(   s   SAX processing, called when text values are read.  Important to
    note that one single text value may be split across multiple calls
    of this method.
    i    s*   Can not accept text "%s" where it appears.N(   R  R  R  R   R3   R   R   (   R'   R7   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt
   characters  s
    (   R   R   R   t   objectR   R  R	  R  R  R  R)   R   R  R)  R/  R1  R2  (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR   z  s   
5	!"		$			3	t   FilePathGeneratorc           B   s;   e  Z d  Z d   Z d   Z d   Z d   Z d   Z RS(   s^  
  This class generates filenames in a series, upon request.
  You can request any iteration number at any time, you don't
  have to go in order.

  Example of iterations for '/path/foo.xml.gz':
    0           --> /path/foo.xml.gz
    1           --> /path/foo1.xml.gz
    2           --> /path/foo2.xml.gz
    _index.xml  --> /path/foo_index.xml
  c         C   s(   t  |  _ d  |  _ d  |  _ d  |  _ d  S(   N(   R   t   is_gzipR   R   t   _prefixt   _suffix(   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     s    			c         C   s  t  j |  } t j j |  } t j j |  j   } | sT t j d |  t	 St
 |  } d } d d d g } x- | D]% } | j |  r| t
 |  } Pq| q| W| s t j d |  t	 S| j d  |  _ t
 |  } | | |  |  _ | | | | | !|  _ | | | |  _ t S(   s6    Splits up a path into forms ready for recombination. s    Couldn't parse the file path: %si    s   .xmls   .xml.gzs   .gzs8   The path "%s" doesn't end in a supported file extension.(   RF   R;   R   R   R"  R$  R   R3   R   R   Rq   R   R5  R   R6  R7  Rb   (   R'   R   t   baset   lenbaset	   lensuffixt   compare_suffixt   suffixt   lenpath(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   Preload   s.    	c         C   sR   |  j  |  j } t |  t j k rJ | r? d | | |  j f S| |  j S| | S(   s/    Generates the iterations, as described above. s   %s%d%s(   R   R6  R,   R-   RP   R7  (   R'   t   instanceR   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   GeneratePathB  s    c         C   sn   | |  j  } d } t |  t j k rW | rG d | | |  j f } qa | |  j } n
 | | } t j |  S(   s7    Generates iterations, but as a URL instead of a path. s   %s%d%sN(   R6  R   R,   R-   RP   R7  RT   R[   (   R'   R?  t   root_urlR   R0  (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   GenerateURLL  s    
c         C   sH   t  j | |  j  } t  j | |  j  } | t |  } | d | S(   s;    Generates a wildcard that should match all our iterations t   *(   RT   R[   R6  R7  Rq   (   R'   RA  R   R9   R<  (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   GenerateWildURLZ  s    (   R   R   R   R)   R>  R@  RB  RD  (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR4    s   		"	
	t   PerURLStatisticsc           B   s)   e  Z d  Z d   Z d   Z d   Z RS(   sD    Keep track of some simple per-URL statistics, like file extension. c         C   s   i  |  _  d  S(   N(   t   _extensions(   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)   g  s    c   	      C   s  | r| j  rt j | j   \ } } } } } | s: d S| j d  r |  j j d  rv |  j d d |  j d <n d |  j d <d S| j d  } | d k r | t |  k  s t  | | } n  | j d  } | d k rG| t |  k  s t  | | j	   } |  j j |  r7|  j | d |  j | <qd |  j | <q|  j j d  rt|  j d d |  j d <qd |  j d <n  d S(   sC    Log some stats for the URL.  At the moment, that means extension. NR   i   i    t   .s   (no extension)(
   RU   R`   Ra   R   RF  RL   t   rfindRq   Rz   R   (	   R'   R   Rd   Re   R   Rf   Rg   R   t   ext(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   Consumek  s0    !c         C   sq   t  |  j  rm t j d d  |  j j   } | j   x2 | D]' } t j d |  j | | f d  q? Wn  d S(   s    Dump out stats to the output. s!   Count of file extensions on URLs:i   s    %7d  %sN(   Rq   RF  R3   RH   R   t   sort(   R'   t   setRI  (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyRH     s    
(   R   R   R   R)   RJ  RH   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyRE  d  s   		&t   Sitemapc           B   sh   e  Z d  Z d   Z d   Z d   Z d   Z d   Z d   Z d   Z	 d   Z
 d	   Z d
   Z RS(   s   
  This is the big workhorse class that processes your inputs and spits
  out sitemap files.  It is built as a SAX handler for set up purposes.
  That is, it processes an XML stream to bring itself up.
  c         C   s   t  j j j j |   g  |  _ g  |  _ i  |  _ g  |  _ d  |  _
 d  |  _ d  |  _ d |  _ d |  _ t   |  _ t |  _ t |  _ d  |  _ d  |  _ d  |  _ | |  _ d  S(   Ni    i   (   R   R   R  R  R)   t   _filterst   _inputst   _urlst   _setR   t   _filegent	   _wildurl1t	   _wildurl2t	   _sitemapst   _dup_maxRE  t   _statR   t   _in_sitet   _in_Site_evert   _default_enct	   _base_urlt   _store_intot	   _suppress(   R'   t   suppress_notify(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR)     s"    														c         C   s  t  } |  j r" t j |  j  n  | rH |  j rH t j d  t } n  | r~ t j	 |  j  r~ t j d |  j  t } n  | r t j
 |  j  |  _ |  j j d  s |  j d |  _ n  t j d |  j d  n  | r*|  j rt   |  _ |  j j |  j  s't } q'q*t j d  t } n  | rf|  j j |  j  |  _ |  j j t |  j  |  _ n  | r|  j rt |  j  t j k st |  j  t j k r|  j d k s|  j j   d k rt |  _ qqqn  | st j d	 d
  n  | S(   s?    Verifies (and cleans up) the basic user-configurable options. s$   A site needs a "base_url" attribute.s1   The "base_url" must be absolute, not relative: %sR   s   BaseURL is set to: %si   s&   A site needs a "store_into" attribute.t   0t   falses.   See "example_config.xml" for more information.i    (   Rb   RZ  RF   R*   R[  R3   R   R   RT   Rh   R[   R   RH   R\  R4  RR  R>  RD  RS  RB  t   SITEINDEX_SUFFIXRT  R]  R,   R-   R<   R.   R   (   R'   t   all_good(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   ValidateBasicConfig  sF    							$$c         C   s   x! |  j  D] } | j |  j  q
 Wt |  j  r@ |  j   n  |  j sc t j d  |  j   n  |  j d k r |  j	   n  |  j
   |  j j   d S(   s1    Run over all the Inputs and ask them to Produce s0   No URLs were recorded, writing an empty sitemap.i   N(   RO  R   t
   ConsumeURLRq   RQ  t   FlushSetRU  R3   R4   t
   WriteIndext   NotifySearchRW  RH   (   R'   t   input(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   Generate  s    	
c         C   s  | s
 d S| j  |  j |  s# d Sd } x0 |  j D]% } | j |  } | d k r3 Pq3 q3 W| pk | d k s | j d d d d  d St j | j |  j	  s t j | j |  j
  r | j d d d d  d S| j   } |  j j |  rR|  j | } | d k r>| d } | |  j | <|  j | k  r>| |  _ q>n  | j d d	  d Sd |  j | <|  j j |  |  j j |  | j   t |  j  t k r|  j   n  d S(
   s   
    All per-URL processing comes together here, regardless of Input.
    Here we run filters, remove duplicates, spill to disk as needed, etc.
    NR   t   FILTEREDRG   i   s   IGNORED (output file)i    i   t	   DUPLICATE(   R   R[  R   RN  R   RH   R   R   RU   RS  RT  R   RP  RL   RV  RQ  R0   RW  RJ  Rq   t   MAXURLS_PER_SITEMAPRe  (   R'   R   R   t   acceptt   filterRM   t   dup(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyRd    s@    

c         C   s  t  j d d  |  j j   xr |  j D]g } | j   } |  j | } | d k r' d |  j | <| j s d t |  t |  j  | _ q q' q' W|  j	 j
 |  j  } | s t  j d  n  |  j d |  _ t  j d | t |  j  f d  d } d } y |  j	 j rMt j j |  } t | d  } t j d	 | d
 | d d  } n t | d  } | j t  x |  j D] } | j |  qsW| j t  | j   | r| j   n  d } d } Wn" t k
 rt  j d |  n Xt j | d  g  |  _ d S(   s   
    Flush the current set of URLs to the output.  This is a little
    slow because we like to sort them all and normalize the priorities
    before dumping.
    s'   Sorting and normalizing collected URLs.i   i    is   %.4fs.   Unexpected: Couldn't generate output filename.s&   Writing Sitemap file "%s" with %d URLst   wbt   fileobjt   filenamet   modet   wts   Couldn't write out to file: %si  N(   R3   RH   RQ  RK  R   RP  RX   R   RV  RR  R@  RU  RO   Rq   R   R5  R   R   R$  t   opent   gzipt   GzipFileR   t   SITEMAP_HEADERR   t   SITEMAP_FOOTERR   R  t   chmod(   R'   R   RM   Ro  Rr  R   R   R$  (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyRe  6  sH    	*	!

c         C   s'  |  j  j t  } | s( t j d  n  t j d | |  j f d  t t j    } y t	 | d  } | j
 t  xW t d |  j  D]C } |  j  j | |  j  } i | d 6| d 6} | j
 t |  q W| j
 t  | j   d
 } Wn" t k
 rt j d |  n Xt j | d	  d
 S(   s-    Write the master index of all Sitemap files s4   Unexpected: Couldn't generate output index filename.s(   Writing index file "%s" with %d Sitemapsi   Rt  i    RU   RV   s   Couldn't write out to file: %si  N(   RR  R@  Ra  R3   RO   RH   RU  R   R   Ru  R   t   SITEINDEX_HEADERR   RB  R[  t   SITEINDEX_ENTRYt   SITEINDEX_FOOTERR   R   R  R   Rz  (   R'   Rr  RV   t   fdt	   mapnumbert   mapurlt   mapattributes(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyRf  o  s&    	

c   
      C   s  |  j  r t j d d  d St j d d  d t j f d     Y} t j } |   t _ |  j d k r |  j j t	 |  j
  } n |  j j d |  j
  } y t j |  } | j   Wn/ t k
 r t j d |  t j d	  n Xx t D] } | d
 } | d } | | | <t j |  } t j | d | d | d | | d f  }	 t j d | d d  t j d |	 d  y' t j |	  } | j   | j   Wq t k
 rt j d | d  q Xq W| r| t _ n  d S(   s@    Send notification of the new Sitemap(s) to the search engines. s)   Search engine notification is suppressed.i   Ns   Notifying search engines.t   ExceptionURLopenerc           B   s   e  Z d    Z RS(   c         S   s$   t  j d | | f d  t  d  S(   Ns   HTTP error %d: %si   (   R3   RH   R  (   R'   R   t   fpt   errcodet   errmsgt   headers(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   http_error_default  s    (   R   R   R  (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s   i    s   When attempting to access our generated Sitemap at the following URL:
    %s
  we failed to read it.  Please verify the store_into path you specified in
  your configuration file is web-accessable.  Consult the FAQ for more
  information.s.   Proceeding to notify with an unverifyable URL.i   i   i   i   s   Notifying: %ss   Notification URL: %ss   Cannot contact: %s(   R]  R3   RH   Rk   t   FancyURLopenert
   _urlopenerRU  RR  RB  Ra  R[  t   urlopenR   R  R   R4   t   NOTIFICATION_SITESt	   urlencodeR`   Ro   t   read(
   R'   R  t
   old_openerR   t   ut   pingt	   query_mapt
   query_attrRf   t   notify(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyRg    sB    			


.
c         C   s  | d k r |  j  r% t j d  qt |  _  t d | d  sD d	 S| j d d
  } | rl t j |  n  | j d  |  _ | j d  |  _ | j d  |  _	 |  j
 s | j d t  |  _
 n  |  j   nC| d k r |  j j t |   n| d k r|  j j t |   n | d k rYx t | d  D] } |  j j t |   q6Wn | d k r|  j j t | |  j   n | d k rx t | d  D] } |  j j t |   qWnP | d k rxA t | d  D] } |  j j t |   qWn t j d |  d	 S(   s7    SAX processing, called per node in the config stream. t   sites/   Can not nest Site entries in the configuration.t   SITEt   verboset   default_encodingR   t
   store_intot   suppress_search_engine_notifyNi    Rn  R   R&  R   t	   directoryt	   accesslogR   s)   Unrecognized tag in the configuration: %s(   s   verboseR  s   base_urlR  R  (   RX  R3   R   Rb   R   R   RS   RZ  R[  R\  R]  R   Rc  RN  R0   R   RO  R   t   ExpandPathAttributeR   R   R   R   (   R'   R   R   R  t   attributeset(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR/    sD    			 			c         C   s4   | d k r0 |  j  s t  t |  _  t |  _ n  d S(   s7    SAX processing, called per node in the config stream. R  N(   RX  Rz   R   Rb   t   _in_site_ever(   R'   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR1    s    	c         C   s6   |  j  s t j d  n |  j s2 t j d  n  d S(   s$    End of SAX, verify we can proceed. s0   The configuration must specify a "site" element.s0   There were no inputs to generate a sitemap from.N(   R  R3   R   RO  R4   (   R'   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   endDocument  s    		(   R   R   R   R)   Rc  Ri  Rd  Re  Rf  Rg  R/  R1  R  (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyRM    s   		3		3	9	 	;	3	c         C   sM   t  } x@ | j   D]2 } | | k r t j d |  | f  t } q q W| S(   s]    Makes sure 'attributes' does not contain any attribute not
      listed in 'goodattributes' s   Unknown %s attribute: %s(   Rb   R   R3   R   R   (   R   R   t   goodattributesRb  R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR     s    c         C   s   |  j  |  } | s |  g St j |  } t j |  } | sG |  g St |   t j k r i  } x" |  j   D] } |  | | | <qo W| }  n  g  } x1 | D]) } |  j   } | | | <| j	 |  q W| S(   s    Given a dictionary of attributes, return a list of dictionaries
      with all the same attributes except for the one named attrib.
      That one, we treat as a file path and expand into all its possible
      variations. (
   R   RF   R;   t   globR,   R-   t   DictionaryTypeR   t   copyR0   (   t   srct   attribR   t   pathlistt   tmpt   keyR0  t   dst(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR    s$    	
c         C   s   d
 } d
 } |  s | | f Sy |  j d  rX t |  d  } t j d | d d  } n t |  d  } | r t j d | |  f d  n t j d |  d  Wn" t k
 r t j d	 |   n X| | f S(   s(    Opens a text file, be it GZip or plain s   .gzt   rbRq  Rs  t   rts   Opened %s file: %si   s   Opened file: %ss   Can not open file: %sN(	   R   R   Ru  Rv  Rw  R3   RH   R  R   (   R   t   logtextR   R   (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR   :  s    
c         C   s   t  j d t  j |    S(   s:   Seconds since epoch (1970-01-01) --> ISO 8601 time string.s   %Y-%m-%dT%H:%M:%SZ(   R   t   strftimet   gmtime(   t   t(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyR   T  s    c         C   s   t  j } t |  } y+ t  j d |  d  t j j |  |  Wn t k
 rd t  j d |   nh t j j	 j
 k
 r } t  j d | j | j | j   f  n' t j j	 j k
 r t  j d  n X| t  j k r | Sd S(   sF    Sets up a new Sitemap object from the specified configuration file.  s   Reading configuration file: %si    s"   Cannot read configuration file: %ss5   XML error in the config file (line %d, column %d): %ss   Some installs of Python 2.2 did not include complete support for XML.
  Please try upgrading your version of Python and re-running the script.N(   R3   RA   RM  RH   R   R   R  R  R   R  R  R  R  R   t   SAXReaderNotAvailableR   (   t
   configpathR^  RA   R   R!  (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   CreateSitemapFromFileY  s    		 c         C   s   i  } d } d } d | d | d } t  j |  } x |  D]| } ya | j |  j   } | j d  r | d | | d <n  | j d  r | d | | d <n  Wq> t k
 r d	 SXq> W| S(
   s   
  Parse command line flags per specified usage, pick off key, value pairs
  All flags of type "--key=value" will be processed as __flags[key] = value,
                    "--option" will be processed as __flags[option] = option
  s   --(?P<key>\S*)[=](?P<value>\S*)s   --(?P<option>\S*)t   (s   )|(t   )R  R^   t   optionN(   R   R   R   t	   groupdictRL   R   R   (   t   argst   flagst   rkeyvalt   roptiont   rt   rct   at   rcg(    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   ProcessCommandFlagst  s    	t   __main__t   configt   helpt   testings%   Configuration file errors -- exiting.s   Number of errors: %ds   Number of warnings: %d(O   t	   __usage__R   t
   hexversiont   versionRN   R   R  Rv  RI   R   R   R   R   R-   Rk   R`   t   xml.saxR   Rb   t   testTruet	   NameErrorR   R6   R5   Rn   R$   R&   Rl  Ra  R   R   t   mapR}   R   R{  R}  R|  Rx  Ry  R   R   R  t	   ExceptionR   R   R   RF   R@   R3   R3  RT   R   R   R   R   R   R   R  R  R   R4  RE  RM  R   R  R   R   R  R  R   t   argvR  RL   RH   R^  R   Ri  RA   RB   (    (    (    s2   /home/docdiana/docdiana.com/sitemap/sitemap_gen.pyt   <module>3   s   
				Q	R%J Y7 v						&
