Class: Yapra::Plugin::Filter::EntryFullText
In: lib-plugins/yapra/plugin/filter/entry_full_text.rb
Parent: Yapra::Plugin::MechanizeBase
Filter::EntryFullText — Yuanying
Gets the full text of an entry from its page using WWW::Mechanize.
- module: Filter::EntryFullText
  config:
    regexp: http://www\.pixiv\.net/*
    extract_xpath:
      title: '//title/text()'
      dc_creator: "//div[@id='profile']/div/text()"
      author: "//div[@id='profile']/div/text()"
      description: "//div[@id='content2']"
    apply_template_after_extracted:
      content_encoded: '<div><%= title %></div>'
# File lib-plugins/yapra/plugin/filter/entry_full_text.rb, line 20
#
# Fetches the page behind every entry whose URL matches the configured
# pattern and hands the page to +extract_attribute_from+.
#
# Configuration keys read here:
# * <tt>config['regexp']</tt> - source of the URL pattern; when absent a
#   generic http/https/ftp URL pattern is used.
# * <tt>config['wait']</tt> - argument passed to +sleep+ after each fetch
#   (defaults to 1).
#
# +data+ is modified in place with <tt>map!</tt> and is also the return value.
# Entries may be objects responding to +link+ or plain URL values. Every
# processed entry that is not exactly an RSS::RDF::Item is replaced by a new
# RSS::RDF::Item. Entries whose URL does not match are left untouched.
def run(data)
  regexp =
    if config['regexp']
      Regexp.new(config['regexp'])
    else
      /^(https?|ftp)(:\/\/[-_.!~*\'()a-zA-Z0-9;\/?:\@&=+\$,%#]+)$/
    end

  wait = config['wait'] || 1

  # NOTE(review): the local variable names in this method are deliberately left
  # as they are, because +binding+ is passed to extract_attribute_from below and
  # that helper may look them up by name - confirm before renaming anything.
  data.map! do |item|
    url = item.respond_to?('link') ? item.link : item

    if regexp =~ url
      logger.debug "Process: #{url}"
      page = agent.get(url)
      sleep wait

      unless item.instance_of?(RSS::RDF::Item)
        new_item = RSS::RDF::Item.new
        # Each attribute falls back to a substitute when the entry does not
        # provide it (for example when the entry is a bare URL string).
        new_item.title = item.title rescue item.to_s
        new_item.date = item.date rescue Time.now
        new_item.description = item.description rescue item.to_s
        # An entry without +link+ is itself the URL that was just fetched, so
        # keep that address instead of discarding it for a '#' placeholder.
        new_item.link = item.link rescue url
        item = new_item
      end

      extract_attribute_from page.root, item, binding
    end

    item
  end

  data
end