summary refs log tree commit diff stats
path: root/wiki_to_nitrate_xml.py
diff options
context:
space:
mode:
author root <root@wlan-5-141.nay.redhat.com> 2011-07-05 19:05:18 +0800
committer root <root@wlan-5-141.nay.redhat.com> 2011-07-05 19:05:18 +0800
commit a69dc47796390b0238fb105686b4a587bc25ce8e (patch)
tree e5d4f70951fce782a5ac0c0d9b1444d983e02173 /wiki_to_nitrate_xml.py
parent 81c5ae716d177cc18fa36d63cb7d5c7435b199f9 (diff)
download repo-a69dc47796390b0238fb105686b4a587bc25ce8e.tar.gz
repo-a69dc47796390b0238fb105686b4a587bc25ce8e.tar.xz
repo-a69dc47796390b0238fb105686b4a587bc25ce8e.zip
Modify the links in the page contents to use full URL addresses
Diffstat (limited to 'wiki_to_nitrate_xml.py')
-rwxr-xr-x wiki_to_nitrate_xml.py 12
1 files changed, 10 insertions, 2 deletions
diff --git a/wiki_to_nitrate_xml.py b/wiki_to_nitrate_xml.py
index c962620..b9d36d1 100755
--- a/wiki_to_nitrate_xml.py
+++ b/wiki_to_nitrate_xml.py
@@ -7,6 +7,7 @@ import re
import time, datetime
import xml.etree.ElementTree as ET
import xml.dom.minidom
+from lxml import html
from BeautifulSoup import BeautifulSoup
try:
@@ -101,14 +102,21 @@ def list_categorymembers(wiki, cat_page, limit=5):
return members
+def repl(link):
+ '''add full url addresses to the links which only have paths.'''
+ if link.startswith('/'):
+ link = 'http://fedoraproject.org' + link
+ return link
+
def extract_to_dict(string, titles):
'''extract wiki contents in html format and cache to table'''
- s_text = string.get('text',{}).get('*','')
s_tag = string.get('categories',{})
tag = []
for t in s_tag:
tag.append(t.get('*',''))
- soup = BeautifulSoup(''.join(s_text))
+ s_text = string.get('text',{}).get('*','')
+ s_text_polished = html.rewrite_links(s_text, repl)
+ soup = BeautifulSoup(''.join(s_text_polished))
table = {}
table['title'] = titles
if soup.find(id='Description') == None: