Modify the links in the contents to use full URL addresses

author: root <root@wlan-5-141.nay.redhat.com> 2011-07-05 19:05:18 +0800
committer: root <root@wlan-5-141.nay.redhat.com> 2011-07-05 19:05:18 +0800
commit: a69dc47796390b0238fb105686b4a587bc25ce8e (patch)
tree: e5d4f70951fce782a5ac0c0d9b1444d983e02173 /wiki_to_nitrate_xml.py
parent: 81c5ae716d177cc18fa36d63cb7d5c7435b199f9 (diff)
download: repo-a69dc47796390b0238fb105686b4a587bc25ce8e.tar.gz
repo-a69dc47796390b0238fb105686b4a587bc25ce8e.tar.xz
repo-a69dc47796390b0238fb105686b4a587bc25ce8e.zip
1 files changed, 10 insertions, 2 deletions
diff --git a/wiki_to_nitrate_xml.py b/wiki_to_nitrate_xml.py
index c962620..b9d36d1 100755
--- a/wiki_to_nitrate_xml.py
+++ b/wiki_to_nitrate_xml.py
@@ -7,6 +7,7 @@ import re
 import time, datetime
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
+from lxml import html
 from BeautifulSoup import BeautifulSoup
 
 try:
@@ -101,14 +102,21 @@ def list_categorymembers(wiki, cat_page, limit=5):
 
     return members
 
+def repl(link):
+    '''Prepend http://fedoraproject.org to links that are only a path (start with "/"); return other links unchanged.'''
+    if link.startswith('/'):
+        link = 'http://fedoraproject.org' + link
+    return link
+
 def extract_to_dict(string, titles):
     '''extract wiki contents in html format and cache to table'''
-    s_text = string.get('text',{}).get('*','')
     s_tag = string.get('categories',{})
     tag = []
     for t in s_tag:
     	tag.append(t.get('*',''))
-    soup = BeautifulSoup(''.join(s_text))
+    s_text = string.get('text',{}).get('*','')
+    s_text_polished = html.rewrite_links(s_text, repl)
+    soup = BeautifulSoup(''.join(s_text_polished))
     table = {}
     table['title'] = titles
     if soup.find(id='Description') == None:
author	root <root@wlan-5-141.nay.redhat.com>	2011-07-05 19:05:18 +0800
committer	root <root@wlan-5-141.nay.redhat.com>	2011-07-05 19:05:18 +0800
commit	a69dc47796390b0238fb105686b4a587bc25ce8e (patch)
tree	e5d4f70951fce782a5ac0c0d9b1444d983e02173 /wiki_to_nitrate_xml.py
parent	81c5ae716d177cc18fa36d63cb7d5c7435b199f9 (diff)
download	repo-a69dc47796390b0238fb105686b4a587bc25ce8e.tar.gz repo-a69dc47796390b0238fb105686b4a587bc25ce8e.tar.xz repo-a69dc47796390b0238fb105686b4a587bc25ce8e.zip