- You need to export the open directory in XML/RDF format from the Open Directory RDF Dump (select content.rdf.u8.gz).
- Unzip the file: gzip -d content.rdf.u8.gz
- You may need to work around some badly encoded character references first: sed -e 's/&#/#/g' content.rdf.u8 > content.rdf.u8.clean
- Using the attached code, you can extract all the languages or just your favorite one.
- Usage: java DMOZSAXParser <dmoz in RDF> [Language]
- Export Arabic URIs: java DMOZSAXParser content.rdf.u8.clean Arabic
- Export All Languages: java DMOZSAXParser content.rdf.u8.clean
Then, we can use Language Detection Library for Java from Cybozu Labs to determine the accuracy of the extracted URI list.
import java.io.File;
import java.io.IOException;

import javax.xml.parsers.*;

import org.xml.sax.*;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that extracts the links listed under the per-language
 * categories ({@code Top/World/<Language>/...}) of an RDF dump and prints
 * them to standard output, one {@code <language>\t<link>} pair per line.
 *
 * <p>Usage:
 * <pre>
 *   java DMOZSAXParser &lt;dmoz in RDF&gt; [Language]
 * </pre>
 * When the optional language is omitted, links of every language are printed.
 * The language comparison is case-insensitive.
 *
 * <p>Note: a topic whose identifier has nothing after the language name
 * (for example {@code Top/World/Arabic}) is not matched; only topics nested
 * below the language are.
 */
public class DMOZSAXParser extends DefaultHandler {

    /** Category prefix under which the per-language sub-trees live. */
    private static final String WORLD_PREFIX = "Top/World/";

    /** Features switched off so the parser never fetches external entities or DTDs. */
    private static final String[] EXTERNAL_ACCESS_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    /** Language of the topic currently being read, or null when that topic is not selected. */
    private String curLanguage = null;

    /** First attribute value of the most recently opened link element. */
    private String templink = null;

    /** Language requested by the caller, or null to accept every language. */
    private String requiredLang = null;

    /**
     * Parses the given dump and prints the matching links.
     *
     * @param fileName     path of the dump; if no such file exists the value is
     *                     handed to the parser as a URI, as the original code did
     * @param requiredLang language to keep, or null for all languages
     */
    private void parseDocument(String fileName, String requiredLang) {
        this.requiredLang = requiredLang;
        SAXParserFactory spf = SAXParserFactory.newInstance();
        for (int i = 0; i < EXTERNAL_ACCESS_FEATURES.length; i++) {
            disableFeature(spf, EXTERNAL_ACCESS_FEATURES[i]);
        }
        try {
            SAXParser sp = spf.newSAXParser();
            File input = new File(fileName);
            if (input.isFile()) {
                // parse(String, ...) interprets its argument as a URI, which
                // mishandles paths containing characters such as '%' or '#'.
                sp.parse(input, this);
            } else {
                sp.parse(fileName, this);
            }
        } catch (SAXException se) {
            reportFailure(fileName, se);
        } catch (ParserConfigurationException pce) {
            reportFailure(fileName, pce);
        } catch (IOException ie) {
            reportFailure(fileName, ie);
        }
    }

    /** Turns one parser feature off; a parser that does not know the feature only gets a warning. */
    private static void disableFeature(SAXParserFactory factory, String feature) {
        try {
            factory.setFeature(feature, false);
        } catch (ParserConfigurationException e) {
            System.err.println("Warning: cannot disable parser feature " + feature);
        } catch (SAXException e) {
            System.err.println("Warning: cannot disable parser feature " + feature);
        }
    }

    /** Reports a parse failure, naming the input, on standard error. */
    private static void reportFailure(String fileName, Exception cause) {
        System.err.println("Failed to parse " + fileName + ": " + cause.getMessage());
        cause.printStackTrace();
    }

    /**
     * Returns the language component of a category identifier.
     *
     * @param category topic identifier, may be null
     * @return the path segment directly after {@code Top/World/}, or null when
     *         the identifier is null, lies outside that sub-tree, or has no
     *         non-empty language segment followed by a further {@code /}
     */
    private static String extractLanguage(String category) {
        if (category == null || !category.startsWith(WORLD_PREFIX)) {
            return null;
        }
        int start = WORLD_PREFIX.length();
        int end = category.indexOf('/', start);
        if (end <= start) {
            return null;
        }
        return category.substring(start, end);
    }

    /**
     * Remembers the language when a selected topic opens, and the target when
     * a link element opens. Both values are read from the element's first
     * attribute.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        if (qName.equalsIgnoreCase("Topic")) {
            String language = extractLanguage(attributes.getValue(0));
            if (language != null
                    && (requiredLang == null || requiredLang.equalsIgnoreCase(language))) {
                curLanguage = language;
            }
        } else if (qName.equalsIgnoreCase("link")) {
            templink = attributes.getValue(0);
        }
    }

    /**
     * Prints the remembered link when a link element closes inside a selected
     * topic, and clears the current language when the topic closes.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (curLanguage == null) {
            return;
        }
        if (qName.equalsIgnoreCase("Topic")) {
            curLanguage = null;
        } else if (qName.equalsIgnoreCase("link") && templink != null) {
            // A link element without attributes has no target; skip it rather
            // than printing the literal text "null".
            System.out.println(curLanguage + "\t" + templink);
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the dump to read; the optional
     *             {@code args[1]} restricts the output to one language
     */
    public static void main(String[] args) {
        if (args == null || args.length < 1) {
            System.err.println("Usage: java DMOZSAXParser <dmoz in RDF> [Language]");
            return;
        }
        String lang = null;
        if (args.length > 1) {
            lang = args[1];
        }
        DMOZSAXParser spe = new DMOZSAXParser();
        spe.parseDocument(args[0], lang);
    }
}