What is the best practice to parse HTML in Swift?

There are several nice libraries for parsing HTML in Swift and Objective-C, such as the following: hpple, NDHpple, Kanna, Fuzi, SwiftSoup and Ji.

Take a look at the following examples for the libraries listed above; most of them select nodes with an XPath query:

hpple:

// Read the HTML file. NSData(contentsOfFile:) is failable, so the result is bound
// with `if let` instead of handing a possibly-nil value to the parser.
if let data = NSData(contentsOfFile: path) {
    let doc = TFHpple(htmlData: data)

    // ends-with() is an XPath 2.0 function, but hpple evaluates queries through libxml2,
    // which implements XPath 1.0 only. The ".txt" suffix test is therefore spelled out
    // with substring()/string-length(): take the last four characters and compare them.
    let query = "//a/@href[substring(., string-length(.) - 3) = '.txt']"

    if let elements = doc.searchWithXPathQuery(query) as? [TFHppleElement] {
        for element in elements {
            // print replaces println, which was removed in Swift 2.
            print(element.content)
        }
    }
}

NDHpple:

// Load the file and decode it as UTF-8. Both initializers are failable, so they are
// bound with `if let` rather than force-unwrapped: a missing file or bytes that are
// not valid UTF-8 would otherwise crash the program.
if let data = NSData(contentsOfFile: path),
   let html = NSString(data: data, encoding: NSUTF8StringEncoding) {
    let doc = NDHpple(HTMLData: html)

    // ends-with() is an XPath 2.0 function; libxml2-based parsers only support XPath 1.0,
    // so the ".txt" suffix test is written with substring()/string-length() instead.
    let query = "//a/@href[substring(., string-length(.) - 3) = '.txt']"

    if let elements = doc.searchWithXPathQuery(query) {
        for element in elements {
            // print replaces println, which was removed in Swift 2.
            print(element.children?.first?.content)
        }
    }
}

Kanna (Xpath and CSS Selectors):

// Sample document. The double quotes inside the markup are escaped (\") so that the
// whole document is a single valid Swift string literal.
let html = "<html><head></head><body><ul><li><input type=\"image\" name=\"input1\" value=\"string1value\" class=\"abc\" /></li><li><input type=\"image\" name=\"input2\" value=\"string2value\" class=\"def\" /></li></ul><span class=\"spantext\"><b>Hello World 1</b></span><span class=\"spantext\"><b>Hello World 2</b></span><a href=\"https://stackoverflow.com/questions/31080818/example.com\">example(English)</a><a href=\"example.co.jp\">example(JP)</a></body>"

if let doc = Kanna.HTML(html: html, encoding: NSUTF8StringEncoding) {
    // bodyNode is never reassigned, so it is declared as a constant.
    let bodyNode = doc.body

    // ends-with() is an XPath 2.0 function; libxml2-based parsers only support XPath 1.0,
    // so the ".txt" suffix test is written with substring()/string-length() instead.
    if let inputNodes = bodyNode?.xpath("//a/@href[substring(., string-length(.) - 3) = '.txt']") {
        for node in inputNodes {
            // print replaces println, which was removed in Swift 2.
            print(node.contents)
        }
    }
}

Fuzi (Xpath and CSS Selectors):

// Sample document. The double quotes inside the markup are escaped (\") so that the
// whole document is a single valid Swift string literal.
let html = "<html><head></head><body><ul><li><input type=\"image\" name=\"input1\" value=\"string1value\" class=\"abc\" /></li><li><input type=\"image\" name=\"input2\" value=\"string2value\" class=\"def\" /></li></ul><span class=\"spantext\"><b>Hello World 1</b></span><span class=\"spantext\"><b>Hello World 2</b></span><a href=\"https://stackoverflow.com/questions/31080818/example.com\">example(English)</a><a href=\"example.co.jp\">example(JP)</a></body>"

do {
  // if encoding is omitted, it defaults to NSUTF8StringEncoding
  let doc = try HTMLDocument(string: html, encoding: NSUTF8StringEncoding)

  // XPath queries
  // ends-with() is an XPath 2.0 function; libxml2-based parsers only support XPath 1.0,
  // so the ".txt" suffix test is written with substring()/string-length() instead.
  for anchor in doc.xpath("//a/@href[substring(., string-length(.) - 3) = '.txt']") {
    print(anchor.stringValue)
  }

} catch let error {
    // Report whatever error the `try` above threw.
    print(error)
}

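The ends-with function is part of XPath 2.0. Parsers built on libxml2, which implements only XPath 1.0, need an equivalent test such as substring(., string-length(.) - 3) = '.txt'.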

SwiftSoup (CSS Selectors):

do {
    // Parse the markup into a document that can be queried with CSS selectors.
    let document: Document = try SwiftSoup.parse("...")

    // a with href
    let anchorsWithHref: Elements = try document.select("a[href]")

    // img with src ending .png
    let pngImages: Elements = try document.select("img[src$=.png]")

    // div with class=masthead
    let mastheadDiv: Element? = try document.select("div.masthead").first()

    // a elements that are direct children of an h3 with class r
    let directResultLinks: Elements? = try document.select("h3.r > a")
} catch Exception.Error(_, let message) {
    // Only the message of the library's error case is reported.
    print(message)
} catch {
    // Any other error falls through to this generic handler.
    print("error")
}

Ji (XPath):

// Build a Ji document from the page URL, then look up the <title> node via XPath.
let supportURL = URL(string: "http://www.apple.com/support")!
let document = Ji(htmlURL: supportURL)
let pageTitle = document?.xPath("//head/title")?.first

// The optional is interpolated as-is, which is why the output is wrapped in Optional(...).
print("title: \(pageTitle?.content)") // title: Optional("Official Apple Support")

I hope this helps you.

Leave a Comment