Changeset - f59baf5fa7dc
[Not reviewed]
default
0 3 0
Nathan Brink (binki) - 15 years ago 2011-03-25 10:58:11
ohnobinki@ohnopublishing.net
Support crawling a single semester of umich. Closes bug 64.
3 files changed with 344 insertions and 62 deletions:
0 comments (0 inline, 0 general)
inc/class.semester.inc
Show inline comments
 
@@ -144,6 +144,9 @@ class Semester
 
   *   Utility function to add a section to the semester,
 
   *   automatically creating classes as necessary.
 
   *
 
   * Crawler functions should generally use this instead of
 
   * Semester::class_add().
 
   *
 
   * \param $dept
 
   *   The department this section belongs to.
 
   * \param $class
 
@@ -174,6 +177,51 @@ class Semester
 

	
 
  /**
   * \brief
   *   Attach a single section_meeting to this Semester, creating the
   *   owning section (via Semester::section_add()) when it is not
   *   present yet.
   *
   * Intended for crawlers whose upstream data presents exactly one
   * section_meeting per record.
   *
   * \param $dept
   *   The department this section_meeting's course belongs to. It is
   *   upper-cased before being used as a lookup key.
   * \param $course
   *   The course number this section_meeting's section belongs to. It
   *   is upper-cased before being used as a lookup key.
   * \param $title
   *   The title of the course the section_meeting belongs to, or NULL.
   * \param $section
   *   The letter or numbers which make up the section's name.
   * \param $synonym
   *   The section synonym or NULL.
   * \param $professor
   *   The section's professor or NULL.
   * \param $section_meeting
   *   The SectionMeeting to be added to a section which may or may
   *   not already be in this Semester.
   * \return
   *   Whatever Semester::section_add() returns when a new section had
   *   to be created; nothing (NULL) when the meeting was appended to
   *   an already-known section.
   */
  public function section_meeting_add($dept, $course, $title, $section, $synonym, $professor, $section_meeting)
  {
    $dept = strtoupper($dept);
    $course = strtoupper($course);

    /* Look the section up only if its course is already known. */
    $existing_section = NULL;
    if (!empty($this->departments[$dept][$course]))
      $existing_section = $this->departments[$dept][$course]->section_get($section);

    if (!empty($existing_section))
      {
	$existing_section->meeting_add($section_meeting);
	return;
      }

    /* Unknown course or section: create the section around this meeting. */
    $new_section = new Section($section, array($section_meeting), $synonym, $professor);
    return $this->section_add($dept, $course, $new_section, $title);
  }
 

	
 
  /**
 
   * \brief
 
   *   Update the time_end.
 
   *
 
   * The time_end is a unix timestamp roughly estimating the time at
 
@@ -280,6 +328,18 @@ class Semester
 

	
 
  /**
   * \brief
   *   Convert this Semester to a string.
   *
   * Lets a Semester be used directly wherever a string is expected,
   * for example as a "%s" argument in a log message.
   *
   * \return
   *   The value of Semester::name_get().
   */
  public function __tostring()
  {
    $name = $this->name_get();
    return $name;
  }
 

	
 
  /**
 
   * \brief
 
   *   Return an identification string for this semester.
 
   *
 
   * Hopefully this identification string should be unique. Also, this
inc/school.crawl.inc
Show inline comments
 
@@ -217,6 +217,36 @@ function school_crawl_days_str_format($d
 

	
 
/**
 * \brief
 *   Map an upstream meeting type onto a more standardized
 *   section_meeting type.
 *
 * The input is lower-cased and looked up first as a whole and then
 * by its first three characters. New mappings should be added to
 * this function as long as they are general enough.
 *
 * \param $meeting_type
 *   The upstream's meeting_type, such as 'LEC', 'lec', 'LAB',
 *   etc. An empty value is treated as 'lecture'.
 * \return
 *   The standardized name if a mapping matched; otherwise the
 *   lower-cased input.
 */
function school_crawl_meeting_type($meeting_type = 'lecture')
{
  static $known_types = array(
			      'lec' => 'lecture',
			      'lab' => 'lab',
			      'dis' => 'discussion',
			      );

  $normalized = strtolower(empty($meeting_type) ? 'lecture' : $meeting_type);

  /* Try the full string first, then fall back onto its three-letter prefix. */
  foreach (array($normalized, substr($normalized, 0, 3)) as $candidate)
    if (!empty($known_types[$candidate]))
      return $known_types[$candidate];

  return $normalized;
}
 

	
 
/**
 
 * \brief
 
 *   Simulate some aspects of a web browser while retrieving a
 
 *   document.
 
 *
 
@@ -577,8 +607,10 @@ function school_crawl_url($orig_url, $ur
 
 * \param $column_name
 
 *   The name of the column to search for.
 
 * \param $strcmp
 
 *   The function to use with a strcmp() interface when judging
 

	
 
 *   The function to use with a strcmp($text_content, $column_name) interface when judging
 
 *   whether or not a <th />'s textContent matches $column_name.
 

	
 
 * \param $trim
 
 *   The function to apply to the <th />'s textContent before
 
 *   subjecting it to the $strcmp test.
 
@@ -591,17 +623,32 @@ function school_crawl_table_resolve_colu
 
{
 
  $th_nodelist = school_crawl_table_rownodes($tr_node);
 
  for ($i = 0; $i < $th_nodelist->length; $i ++)
 
    if (!$strcmp($column_name, $trim($th_nodelist->item($i)->textContent)))
 
    if (!$strcmp($trim($th_nodelist->item($i)->textContent), $column_name))
 
      return $i;
 
  return FALSE;
 
}
 

	
 
/**
 * \brief
 *   A strcmp()-style comparator which tests a regular expression,
 *   for use as school_crawl_table_resolve_column()'s $strcmp
 *   argument.
 *
 * \param $text_content
 *   The text to test.
 * \param $column_name_regex
 *   A PCRE pattern, including delimiters, handed to preg_match().
 * \return
 *   0 if the pattern matches $text_content, 1 otherwise.
 */
function school_crawl_table_resolve_column_regexcmp($text_content, $column_name_regex)
{
  return preg_match($column_name_regex, $text_content) ? 0 : 1;
}
 

	
 
/**
 
 * \brief
 
 *   Get a DOMNodeList of a row's elements without #text elements in
 
 *   the way.
 
 *
 
 * Helpful when using school_crawl_table_resolve_column() to get data.
 
 *
 
 * \return
 
 *   A DOMNodeList.
 
 */
 
function school_crawl_table_rownodes(DOMElement $tr_node)
 
{
school.d/umich.crawl.inc
Show inline comments
 
@@ -18,56 +18,14 @@
 
 * along with SlatePermutate.  If not, see <http://www.gnu.org/licenses/>.
 
 */
 

	
 

	
 
/**
 * \brief
 *   array_filter() callback which filters out whitespace-only items.
 *
 * \param $item
 *   The array element to inspect.
 * \return
 *   TRUE to keep $item (it has non-whitespace content), FALSE to
 *   drop it (it is empty or consists only of whitespace).
 */
function umich_arrayfilter_callback($item)
{
  /*
   * Both branches used to return TRUE, which kept every item. An
   * array_filter() callback must return FALSE for the items to drop.
   */
  return trim((string)$item) !== '';
}
 

	
 
/**
 * \brief
 *   Parse the HTML at a URL into an array of table rows.
 *
 * Only the fourth <table /> of the document is read, and its first
 * six rows are skipped. Each remaining row becomes an array of the
 * text of its <td /> cells, filtered through
 * umich_arrayfilter_callback(). The last three collected rows
 * (navigation and trailing garbage) are dropped, as are rows which
 * end up empty.
 *
 * \param $url
 *   The URL (or any path file_get_contents() accepts) of the HTML
 *   document.
 * \return
 *   An array of rows, each an array of cell strings. Keys are not
 *   guaranteed to be contiguous because empty rows are removed
 *   without reindexing. Returns 1 if the document could not be
 *   fetched or does not contain the expected table.
 */
function umich_table_parse($url)
{
  $html = file_get_contents($url);
  if (!$html)
    return 1;

  $dom = new DOMDocument;
  $dom->loadHTML($html);
  /* NOTE(review): set after loadHTML(), so it likely has no effect on the parsed document -- confirm. */
  $dom->preserveWhiteSpace = false;

  /* The data of interest lives in the fourth table on the page. */
  $table = $dom->getElementsByTagName('table')->item(3);
  if ($table === NULL)
    /* Avoid a fatal error when the page layout is not what we expect. */
    return 1;

  $arr = array();
  foreach ($table->getElementsByTagName('tr') as $rownum => $row)
    {
      /* The first six rows are skipped. */
      if ($rownum <= 5)
	continue;
      foreach ($row->getElementsByTagName('td') as $colnum => $col)
	$arr[$rownum][$colnum] = $col->nodeValue;
    }

  /* Assign by key instead of by reference so no dangling reference is left behind. */
  foreach ($arr as $rownum => $cells)
    $arr[$rownum] = array_filter($cells, "umich_arrayfilter_callback");

  $arr = array_values($arr); // Reindex array

  /*
   * Strip navigation and trailing garbage: the last three rows. If
   * there are fewer than three rows, nothing is left.
   */
  $arr = array_slice($arr, 0, -3);

  /* Drop rows which have no cells left. */
  return array_filter($arr);
}
 

	
 
/**
 
 * \brief
 
 *  Crawls University of Michigan's schedule.
 
 *
 
 * Potential startpoints:
 
 * - http://lsa.umich.edu/cg/cg_advsearch.aspx (HTML/curl-based)
 
 * - http://ro.umich.edu/schedule/ (harder HTML for semester guessing, one CSV download for entire semester -- <=4MB)
 
 *
 
 * \param $semesters
 
 *   An array to be filled with semesters.
 
 * \param $school_crawl_log
 
@@ -77,26 +35,243 @@ function umich_table_parse($url)
 
 */
 
function umich_crawl(array &$semesters, $school_crawl_log)
{
  /*
   * The registrar's schedule page lists each semester together with
   * a link to a CSV download covering that whole semester.
   */
  $url = 'http://ro.umich.edu/schedule/';
  $cookies = array();

  /* determine list of semesters: */
  $semesters_dom = new DOMDocument();
  $semesters_dom->loadHTML(school_crawl_geturi($url, $cookies, $school_crawl_log));
  $semesters_xpath = new DOMXPath($semesters_dom);

  $tables_nodelist = $semesters_dom->getElementsByTagName('table');
  foreach ($tables_nodelist as $table)
    {
      /* Grab the first row of the table; it should hold the column headings. */
      $table_tr = NULL;
      foreach ($semesters_xpath->query('tr', $table) as $table_tr)
	break;
      if (empty($table_tr))
	{
	  school_crawl_logf($school_crawl_log, 5, "Unable to find first row in table which I suspect to be a table holding all of the semesters I'm interested in.");
	  continue;
	}

      /* Map the columns we need onto their positions within a row. */
      $semester_columns = array(
				'name' => school_crawl_table_resolve_column($table_tr, 'Term'),
				'csv' => school_crawl_table_resolve_column($table_tr, '/[cC][sS][vV]/', 'school_crawl_table_resolve_column_regexcmp'),
				);
      foreach ($semester_columns as $semester_column_name => $semester_column)
	if ($semester_column === FALSE)
	  {
	    school_crawl_logf($school_crawl_log, 4, "Unable to resolve column %s onto a column in a semester listing table. Skipping this table.",
			      $semester_column_name);
	    $semester_columns = NULL;
	  }
      if (empty($semester_columns))
	continue;

      $first = TRUE;
      foreach ($semesters_xpath->query('tr', $table) as $table_tr)
	{
	  /* Skip the heading row. */
	  if ($first)
	    {
	      $first = FALSE;
	      continue;
	    }

	  $rownodes = school_crawl_table_rownodes($table_tr);
	  $semester_name = $rownodes->item($semester_columns['name']);
	  $semester_csv = $rownodes->item($semester_columns['csv']);
	  if (empty($semester_name) || empty($semester_csv))
	    {
	      /* DOMNodeList::item() yields NULL for a row with fewer cells than the heading row. */
	      school_crawl_logf($school_crawl_log, 4, "Row in a semester listing table has too few cells. Skipping this row.");
	      continue;
	    }

	  /* The term name is "<season> <year>"; Semester's constructor is called with the year first. */
	  if (!preg_match('/^(.+) ([0-9]+)$/', $semester_name->textContent, $matches))
	    {
	      school_crawl_logf($school_crawl_log, 4, "Unable to parse semester name `%s'. Skipping this semester.",
				$semester_name->textContent);
	      continue;
	    }
	  $semester = new Semester($matches[2], $matches[1]);

	  /* Find the first link inside of the CSV cell. */
	  $a = NULL;
	  foreach ($semesters_xpath->query('descendant::a', $semester_csv) as $a)
	    break;
	  if (empty($a) || !$a->hasAttribute('href'))
	    {
	      school_crawl_logf($school_crawl_log, 4, "Unable to find <a /> element with an href attribute for a CSV link for the %s semester. Skipping this semester. (textContent of csv column: %s)",
				$semester, $semester_csv->textContent);
	      continue;
	    }

	  /* umich_crawl_csv() signals success with a falsy return value. */
	  if (!umich_crawl_csv($school_crawl_log, $semester, $a->getAttribute('href')))
	    {
	      $semesters[] = $semester;
	      /**
	       * \todo
	       *   If we try to crawl more than one umich semester,
	       *   PHP runs out of memory. We need to bump our API
	       *   and rehash script to support incremental crawling
	       *   or early data committing if we want umich
	       *   crawling to work for more than one semester.
	       */
	      return 0;
	    }
	  else
	    school_crawl_logf($school_crawl_log, 2, "Unable to interpret CSV information for %s. Skipping semester.",
			      $semester);
	}
    }

  return 0;
}
 

	
 
/**
 * \brief
 *   Handle the crawling of one semester of umich.
 *
 * Downloads the CSV file, resolves the columns named in its heading
 * row and feeds every parsable row into $semester as a
 * section_meeting. Rows whose meeting time cannot be parsed still
 * cause their course to be registered.
 *
 * \param $school_crawl_log
 *   The school_crawl_log handle.
 * \param $semester
 *   A Semester object to populate with courses from this semester.
 * \param $csv_href
 *   A link to a CSV file which will be downloaded and parsed.
 * \return
 *   0 on success, 1 if a required column is missing from the CSV
 *   file.
 */
function umich_crawl_csv($school_crawl_log, &$semester, $csv_href)
{
  school_crawl_logf($school_crawl_log, 3, "Crawling %s.",
		    $semester);

  $cookies = array();
  $uri = $csv_href;

  /*
   * parse into lines and then each row needs to be individually
   * parsed. The line delimiter is spelled out instead of using
   * PHP_EOL because the format of the downloaded file does not depend
   * on the platform this crawler runs on.
   */
  $csv = str_getcsv(school_crawl_geturi($uri, $cookies, $school_crawl_log), "\n");

  $fields = array(
		  'Term' => FALSE /* $semester->season_get() . ' ' . $semester->year_get() */,
		  'Session' => FALSE /* "Regular Academic Session", "First 7 Week Session", "Second 7 Week Session" <-- half-semester support? */,
		  'Acad Group' => FALSE /* long version of the department sorta, more general than the subject field */,
		  'Class Nbr' => FALSE /* unqualified course_id */,
		  'Subject' => FALSE /* "Mathematics (MATH)" */,
		  'Catalog Nbr' => FALSE /* "10001", i.e. section synonym */,
		  'Section' => FALSE /* the section's name */,
		  'Course Title' => FALSE /* the title of the course */,
		  'Component' => FALSE /* "LAB", "LEC", "REC" -- i.e., meeting_type(?) */,
		  'Codes' => FALSE /* "P  W", "P   ", "P R ", "PI  ", "A   ", "P RW" ??????? (reminds me of ``svn status''). If flag[3] = 'W', then the class has a meeting times */,
		  'M' => FALSE /* if a day is enabled, it is set to itself. I.e., $row['M'] = 'M' or $row['M'] = '' */,
		  'T' => FALSE,
		  'W' => FALSE,
		  'TH' => FALSE,
		  'F' => FALSE,
		  'S' => FALSE,
		  'SU' => FALSE /* OK, we'll have to add Sunday support someday ;-) */,
		  'Start Date' => FALSE /* yea! */,
		  'End Date' => FALSE /* "12/13/2011" */,
		  'Time' => FALSE /* "1230-130PM", "9-1030AM", "1130-1PM" */,
		  'Location' => FALSE,
		  'Instructor' => FALSE,
		  'Units' => FALSE /* As in credit hours */,
		  );
  /* Columns which we know about but whose absence is not an error. */
  $ignored_fields = array(
			  'Term' => TRUE,
			  'Session' => TRUE,
			  'Acad Group' => TRUE,
			  'Codes' => TRUE,
			  'SU' => TRUE,
			  'Units' => TRUE,
			  );

  /* Resolve each column heading onto its position within a row. */
  foreach (str_getcsv(rtrim((string)$csv[0], "\r")) as $col_num => $col_name)
    if (isset($fields[$col_name]))
      $fields[$col_name] = $col_num;
    else
      school_crawl_logf($school_crawl_log, 6, "We do not recognize the %s column in the CSV file for %s.",
			$col_name, $semester);

  foreach ($fields as $field => $col_num)
    if ($col_num === FALSE
	&& empty($ignored_fields[$field]))
      {
	school_crawl_logf($school_crawl_log, 2, "Unable to find column %s in CSV for %s. Skipping this semester.",
			  $field, $semester);
	return 1;
      }

  /*
   * Every non-ignored column has been resolved at this point, so
   * the highest of their indexes tells us how many cells a row
   * needs to have to be usable.
   */
  $needed_columns = max(array_diff_key($fields, $ignored_fields)) + 1;

  /* remove the row with heading from the CSV dataset */
  unset($csv[0]);

  /* Now actually parse some data :-). */
  foreach ($csv as $line)
    {
      /* Tolerate CRLF line endings and blank (e.g., trailing) lines. */
      $line = rtrim((string)$line, "\r");
      if (!strlen(trim($line)))
	continue;

      $row = str_getcsv($line);
      if (count($row) < $needed_columns)
	{
	  school_crawl_logf($school_crawl_log, 5, "Row has too few columns: `%s'. Skipping row.",
			    $line);
	  continue;
	}
      $synonym = trim($row[$fields['Catalog Nbr']]);

      /* The department's abbreviation is in parentheses at the end of the subject. */
      if (!preg_match(';\(([A-Z]+)\)$;', $row[$fields['Subject']], $matches))
	{
	  school_crawl_logf($school_crawl_log, 5, "Unable to parse department string `%s'. Skipping section/course (synonym=%s).",
			    $row[$fields['Subject']], $synonym);
	  continue;
	}
      $dept = $matches[1];

      /* A day's column is non-empty exactly when the section meets on that day. */
      $days = '';
      foreach (array('M' => 'm', 'T' => 't', 'W' => 'w', 'TH' => 'h', 'F' => 'f', 'S' => 's')
	       as $field => $day)
	if (strlen(trim($row[$fields[$field]])))
	  $days .= $day;

      if (!preg_match(';^([0-9]+)-([0-9]+)([AP])M$;', $row[$fields['Time']], $matches))
	{
	  school_crawl_logf($school_crawl_log, 4, "Unable to parse meeting time: `%s'. Skipping section/meeting (synonym=%s).",
			    $row[$fields['Time']], $synonym);
	  /* ensure that the class is added nonetheless */
	  if ($semester->class_get($dept, $row[$fields['Class Nbr']]) === NULL)
	    $semester->class_add(new Course($dept . '-' . $row[$fields['Class Nbr']], $row[$fields['Course Title']]));
	  continue;
	}
      /*
       * Only the end time carries an AM/PM marker, so it is parsed
       * first and then used to disambiguate the start time.
       */
      $time_end = umich_crawl_time($matches[2], $matches[3]);
      $time_start = umich_crawl_time($matches[1], FALSE, $time_end);

      $semester->section_meeting_add($dept, $row[$fields['Class Nbr']], $row[$fields['Course Title']],
				     $row[$fields['Section']], $row[$fields['Catalog Nbr']], $row[$fields['Instructor']],
				     new SectionMeeting($days, $time_start, $time_end, $row[$fields['Location']], school_crawl_meeting_type($row[$fields['Component']])));
    }

  /* The caller treats a falsy return value as success. */
  return 0;
}
 

	
 
/**
 * \brief
 *   Try to turn a umich-formatted time into something usable.
 *
 * \param $raw
 *   The raw input: one or two digits of hour optionally followed by
 *   two digits of minutes on a 12-hour clock, e.g. '9', '130' or
 *   '1230'.
 * \param $xm
 *   FALSE or, if PM or AM was specified, 'P' for PM and 'A' for AM.
 * \param $before
 *   A time of day, formatted as this function's return value is,
 *   before which this time must be. Used generally for the start
 *   time of a class. The end time of a class must be parsed first so
 *   that the result of that calculation may be passed as the $before
 *   value. Only consulted when $xm is FALSE.
 * \return
 *   The time on a 24-hour clock as a four-character string of the
 *   form 'HHMM', e.g. '1330'.
 */
function umich_crawl_time($raw, $xm = FALSE, $before = '2400')
{
  $h = $raw;
  $m = '00';
  if (strlen($raw) > 2)
    {
      $h = substr($raw, 0, -2);
      $m = substr($raw, -2);
    }
  $h = (int)$h;
  $m = (int)$m;

  /*
   * On a 12-hour clock, 12 is the first hour of its half of the day:
   * 12AM is 0000 and 12PM is 1200. Without this, 12PM would be
   * turned into 2400 below.
   */
  if ($h == 12)
    $h = 0;

  if ($xm === FALSE)
    {
      $before_minutes = (int)substr($before, 0, 2) * 60 + (int)substr($before, 2);

      /* if the time could feasibly be in the afternoon, assume it is: */
      if (($h + 12) * 60 + $m < $before_minutes)
	$xm = 'P';
      else
	$xm = 'A';
    }

  if (!strcmp($xm, 'P'))
    $h += 12;

  return sprintf('%02d%02d', $h, $m);
}
0 comments (0 inline, 0 general)