diff options
author | root <root@wlan-5-141.nay.redhat.com> | 2011-07-05 19:05:18 +0800 |
---|---|---|
committer | root <root@wlan-5-141.nay.redhat.com> | 2011-07-05 19:05:18 +0800 |
commit | a69dc47796390b0238fb105686b4a587bc25ce8e (patch) | |
tree | e5d4f70951fce782a5ac0c0d9b1444d983e02173 /wiki_to_nitrate_xml.py | |
parent | 81c5ae716d177cc18fa36d63cb7d5c7435b199f9 (diff) | |
download | repo-a69dc47796390b0238fb105686b4a587bc25ce8e.tar.gz repo-a69dc47796390b0238fb105686b4a587bc25ce8e.tar.xz repo-a69dc47796390b0238fb105686b4a587bc25ce8e.zip |
modifying the links in the contents to have full url addresses
Diffstat (limited to 'wiki_to_nitrate_xml.py')
-rwxr-xr-x | wiki_to_nitrate_xml.py | 12 |
1 files changed, 10 insertions, 2 deletions
```diff
diff --git a/wiki_to_nitrate_xml.py b/wiki_to_nitrate_xml.py
index c962620..b9d36d1 100755
--- a/wiki_to_nitrate_xml.py
+++ b/wiki_to_nitrate_xml.py
@@ -7,6 +7,7 @@ import re
 import time, datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
+from lxml import html
 from BeautifulSoup import BeautifulSoup
 
 try:
@@ -101,14 +102,21 @@ def list_categorymembers(wiki, cat_page, limit=5):
 
     return members
 
+def repl(link):
+    '''add full url addresses to the links which only have paths.'''
+    if link.startswith('/'):
+        link = 'http://fedoraproject.org' + link
+    return link
+
 def extract_to_dict(string, titles):
     '''extract wiki contents in html format and cache to table'''
-    s_text = string.get('text',{}).get('*','')
     s_tag = string.get('categories',{})
     tag = []
     for t in s_tag:
         tag.append(t.get('*',''))
-    soup = BeautifulSoup(''.join(s_text))
+    s_text = string.get('text',{}).get('*','')
+    s_text_polished = html.rewrite_links(s_text, repl)
+    soup = BeautifulSoup(''.join(s_text_polished))
     table = {}
     table['title'] = titles
     if soup.find(id='Description') == None:
```