Source code for wex.sitemaps

""" Extractors for URLs from 
`/robots.txt <http://en.wikipedia.org/wiki/Robots_exclusion_standard#Sitemap>`_
and `sitemaps <http://www.sitemaps.org/protocol.html>`_.
"""


from __future__ import unicode_literals, absolute_import, print_function
import wex.py2compat ; assert wex.py2compat  # imported for side effects; assert stops linters flagging it as unused
from lxml.etree import iterparse
from codecs import getreader
from six.moves.urllib_parse import urljoin
from wex.extractor import chained
from wex.http import decode
from wex.url import URL


def urls_from_robots_txt(response):
    """ Yields sitemap URLs from "/robots.txt" """

    url = URL(response.request_url or response.url or '')
    if url.parsed.path != '/robots.txt':
        return

    charset = response.headers.get_content_charset()
    lines = getreader(charset or 'ISO-8859-1')(response)
    for line in lines:
        content, _, comment = line.partition('#')
        field, _, value = content.partition(':')
        if field.strip().lower() != 'sitemap':
            continue
        # we shouldn't need to urljoin but we do just in case
        joined = URL(urljoin(response.url, value.strip()))
        # set sitemap=True in fragment to help downstream processing
        yield "url", joined.update_fragment_dict(sitemap=True)
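
The loop above treats each robots.txt line as a ``field: value`` pair, with
``#`` starting a comment, per the robots exclusion convention. A minimal
stdlib-only sketch of the same line handling, on a hypothetical robots.txt
body (``example.com`` is illustrative only)::

    from io import StringIO

    robots_txt = StringIO(
        "User-agent: *\n"
        "Disallow: /private/  # comment text is stripped\n"
        "Sitemap: http://example.com/sitemap.xml\n"
    )
    for line in robots_txt:
        content, _, comment = line.partition('#')
        # partition splits at the first ':', so the 'http://' colon is safe
        field, _, value = content.partition(':')
        if field.strip().lower() == 'sitemap':
            print(value.strip())  # -> http://example.com/sitemap.xml
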
def urls_from_urlset_or_sitemapindex(response):
    """ Yields URLs from ``<urlset>`` or ``<sitemapindex>`` elements
    as per `sitemaps.org <http://www.sitemaps.org/protocol.html>`_.
    """

    sitemap = URL(response.url).fragment_dict.get('sitemap')
    content_subtypes = response.headers.get_content_subtype().split('+')
    if not sitemap and 'xml' not in content_subtypes:
        return

    root = None
    for _, elem in iterparse(decode(response)):

        if root is None:
            root = elem.getroottree().getroot()
            if not (root.tag.endswith('}sitemapindex') or
                    root.tag.endswith('}urlset')):
                # root element has wrong tag - give up
                break

        if elem.tag.endswith('}loc') and elem.text is not None:
            text = elem.text.strip()
            if text:
                # http://www.sitemaps.org/protocol.html#locdef
                url = URL(urljoin(response.url, text))
                if elem.getparent().tag.endswith('}sitemap'):
                    # set sitemap=True to help downstream processing
                    url = url.update_fragment_dict(sitemap=True)
                yield "url", url

        if elem.getparent() is root:
            # release memory for previous elements
            while elem.getprevious() is not None:
                del root[0]
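
For reference, the documents this generator walks look like the following
(a hypothetical two-URL ``<urlset>``). ``iterparse`` reports namespaced
tags in ``{namespace}tag`` form, which is why the code above matches them
with ``endswith``::

    from io import BytesIO
    from lxml.etree import iterparse

    urlset = BytesIO(
        b'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
        b'<url><loc>http://example.com/a</loc></url>'
        b'<url><loc>http://example.com/b</loc></url>'
        b'</urlset>'
    )
    # default events are ('end',), so each element is complete when seen
    for _, elem in iterparse(urlset):
        if elem.tag.endswith('}loc'):
            print(elem.text)
    # prints http://example.com/a then http://example.com/b
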
#: Extractor that combines :func:`.urls_from_robots_txt` and
#: :func:`.urls_from_urlset_or_sitemapindex`.
urls_from_sitemaps = chained(urls_from_robots_txt,
                             urls_from_urlset_or_sitemapindex)
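
As a rough usage sketch, assuming the chained extractor is called like any
single extractor (a callable taking a response and yielding ``(name, value)``
pairs): any object exposing the attributes the two functions above actually
read will do. The ``StubResponse`` below is hypothetical; real responses
come from wex itself::

    from email.message import Message
    from io import BytesIO

    from wex.sitemaps import urls_from_sitemaps

    headers = Message()
    headers['Content-Type'] = 'text/plain; charset=UTF-8'

    class StubResponse(BytesIO):
        # only the attributes the extractors above actually read
        url = 'http://example.com/robots.txt'
        request_url = url

    StubResponse.headers = headers

    stub = StubResponse(b'Sitemap: http://example.com/sitemap.xml\n')
    for name, url in urls_from_sitemaps(stub):
        # yields "url" and the sitemap URL (with a sitemap=True fragment)
        print(name, url)
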