#!/usr/local/bin/ruby
#
# This script collects paper titles from the ICML conferences over the last 20 years and
# computes some term frequency statistics. It does not require any arguments or configuration
# as it gathers what it needs from the web (including a stop word list). The resulting term
# counts are written to `term_counts.csv` in the working directory.
#
# The titles for each conference are also written to file as they are scraped from the web.
# These are written to `icmlYYYY.txt` in the working directory for each year YYYY of the ICML
# conference.
#
# AUTHOR: Mark Reid
# CREATED: 25th June 2007
# MODIFIED: 1st July 2007
require 'rubygems'
require 'net/http'
require 'uri'
require 'mechanize'
require 'stemmer'

# Output file
OUTPUT_FILE = 'term_counts.csv'

# Locations of the raw ICML paper title data
ICML_DBLP = 'http://www.informatik.uni-trier.de/~ley/db/conf/icml/'
ICML_2007 = 'http://oregonstate.edu/conferences/icml2007/paperlist.html'

# Gets a list of stop words from the given URI (one word per line).
# Each entry is stripped and blank lines are dropped so that a list served with
# CRLF line endings or trailing whitespace still matches the lowercased title words.
STOP_WORDS_URI = URI.parse 'http://www.dcs.gla.ac.uk/idom/ir_resources/linguistic_utils/stop_words'
STOP_WORDS = Net::HTTP.get(STOP_WORDS_URI).split("\n").
  map { |word| word.strip }.
  reject { |word| word.empty? }

# Stores the title of a paper and the salient terms in the title
class Paper
  attr_reader :title
  attr_reader :terms

  # Removes stop words, punctuation, lowercases and stems the title
  # and saves a list of processed terms.
  #
  # title - the raw title string; surrounding whitespace is stripped before storing.
  #
  # After construction `terms` holds each distinct stemmed term once, in order of
  # first appearance.
  def initialize(title)
    @title = title.strip
    # Splitting on non-word characters discards punctuation and whitespace; the
    # empty strings this leaves behind are rejected along with the stop words.
    words = @title.split(/\W/).map { |word| word.downcase }
    words = words.reject { |word| word.empty? || STOP_WORDS.include?(word) }
    @terms = words.map { |word| word.stem }.uniq
  end
end

# Stores paper titles and word stats regarding a single ICML conference.
class Conference
  attr_reader :year
  attr_reader :counts
  attr_reader :papers

  # Creates a new conference with the given year
  def initialize(year)
    @year = year
    # term => number of papers whose title contains it; unseen terms read as 0
    @counts = Hash.new(0)
    @papers = []
  end

  # Adds the paper to this conference, performing stemming and stop-word
  # removal to update keywords stats.
  #
  # title - the raw title string of the paper to add.
  def <<(title)
    paper = Paper.new(title)
    @papers << paper
    # A paper's terms are unique, so each paper adds at most one to any term's tally.
    paper.terms.each { |term| @counts[term] += 1 }
  end

  # Writes the paper titles (one per line) for this conference to a file called
  # `icmlYYYY.txt` where `YYYY` is the year of this conference.
  def save
    File.open("icml#{year}.txt", 'w') do |file|
      papers.each { |paper| file.puts paper.title }
    end
  end

  # Computes the number of papers that contain the given word in their title.
  # The value is read from the tally maintained by `<<` rather than by
  # rescanning every paper; terms that were never seen give 0.
  def count(term)
    @counts[term]
  end

  # Collects all the terms used in papers in this conference
  # (each distinct term once).
  def terms
    papers.map { |paper| paper.terms }.flatten.uniq
  end
end

# Initialise conference list and scraping agent
confs = []
agent = WWW::Mechanize.new

# Scrape the ICML 2007 data from the OSU site, saving titles to file
icml2007 = Conference.new('2007')
confs << icml2007
agent.get(ICML_2007).search('//a[@name]').each do |a|
  icml2007 << a.inner_text
end
icml2007.save
puts "Scraped #{icml2007.papers.length} papers for ICML 2007."

# Scrape the other ICML data from the DBLP site, saving the titles to file
agent.get(ICML_DBLP).links.text('Contents').each do |link|
  # The conference year is taken from the first four-digit run in the link target.
  year_match = /\d{4}/.match(link.href.to_s)
  unless year_match
    warn "Skipping #{link.href.inspect}: no four-digit year found in link."
    next
  end
  year = year_match[0]

  conference = Conference.new(year)
  confs << conference

  page = agent.click(link)
  page.search('//li').each do |li|
    # Collapse line breaks to a single space so each title stays on one line
    # in the saved `icmlYYYY.txt` file.
    text = li.inner_text.gsub(/\s*\n\s*/, ' ')
    # The title is taken to be the text between the first colon and the
    # following full stop of the list item.
    title_match = /:([^.]+)\./.match(text)
    conference << title_match[1] if title_match
  end

  conference.save
  puts "Scraped #{conference.papers.length} papers for ICML #{year}."
end

# Collect all the terms used over all conferences and compute the number of
# papers in each conference that contained each term.
# Write the statistics as CSV: a header row of conference years, a row of paper
# totals, then one row per term (sorted) with that term's paper count per conference.
File.open(OUTPUT_FILE, 'w') do |freqs|
  # Header with conference years
  freqs.puts((['term'] + confs.map { |conf| conf.year }).join(','))

  # Total paper count for each conference
  freqs.puts((['paper_count'] + confs.map { |conf| conf.papers.length }).join(','))

  # Term frequencies per conference. Each conference already tallies in `counts`
  # how many of its papers use a term (terms are unique within a paper, and
  # unseen terms read as 0), so the value is looked up instead of rescanning
  # every paper for every term.
  all_terms = confs.map { |conf| conf.terms }.flatten.uniq.sort
  all_terms.each do |term|
    freqs.puts(([term] + confs.map { |conf| conf.counts[term] }).join(','))
  end
end