SlatePermutate Changeset - f59baf5fa7dc · protofusion repositories

Changeset - f59baf5fa7dc

Parent rev.

Child rev.

[Not reviewed]

default

0 3 0

Nathan Brink (binki) - 15 years ago 2011-03-25 10:58:11
ohnobinki@ohnopublishing.net

Support crawling a single semester of umich. Closes bug 64.

3 files changed with 344 insertions and 62 deletions:

inc/class.semester.inc

inc/school.crawl.inc

school.d/umich.crawl.inc

235

0 comments (0 inline, 0 general)

inc/class.semester.inc

➞

Show inline comments

@@ @@ -144,6 +144,9 @@ class Semester @@
    *   Utility function to add a section to the semester,
    *   automatically creating classes as necessary.
+   *
    * Crawler functions should generally use this instead of
    * Semester::class_add().
+   *
    * \param $dept
    *   The department this section belongs to.
    * \param $class
@@ @@ -174,6 +177,51 @@ class Semester @@
   /**
    * \brief
    *   Attach a section_meeting to its section, creating the section
    *   (through Semester::section_add()) when it does not exist yet.
    *
    * Intended for crawlers whose upstream data yields a single
    * section_meeting per record, i.e., well-formed tabular data.
    *
    * \param $dept
    *   The department this section_meeting's course belongs to.
    * \param $course
    *   The course number this section_meeting's section belongs to.
    * \param $title
    *   The title of the course the section_meeting belongs to, or
    *   NULL.
    * \param $section
    *   The letter or numbers which make up the section's name.
    * \param $synonym
    *   The section synonym or NULL.
    * \param $professor
    *   The section's professor or NULL.
    * \param $section_meeting
    *   The SectionMeeting to be added to a section which may or may
    *   not already be in this Semester.
    * \return
    *   Whatever Semester::section_add() returns when a new section had
    *   to be created; NULL when the meeting was appended to an
    *   existing section.
    */
   public function section_meeting_add($dept, $course, $title, $section, $synonym, $professor, $section_meeting)
   {
     $dept_key = strtoupper($dept);
     $course_key = strtoupper($course);

     /* Look for a section which is already registered under this course. */
     $existing_section = NULL;
     if (!empty($this->departments[$dept_key][$course_key]))
       $existing_section = $this->departments[$dept_key][$course_key]->section_get($section);

     if (!empty($existing_section))
       {
         $existing_section->meeting_add($section_meeting);
         return;
       }

     /* Either the course or the section is unknown: create them. */
     $new_section = new Section($section, array($section_meeting), $synonym, $professor);
     return $this->section_add($dept_key, $course_key, $new_section, $title);
   }
   /**
    * \brief
    *   Update the time_end.
+   *
    * The time_end is a unix timestamp roughly estimating the time at
@@ @@ -280,6 +328,18 @@ class Semester @@
   /**
    * \brief
    *   Convert this semester into a string.
    *
    * \return
    *   The value of $this->name_get().
    */
   public function __tostring()
   {
     $name = $this->name_get();
     return $name;
   }
   /**
    * \brief
    *   Return an identification string for this semester.
+   *
    * Hopefully this identification string should be unique. Also, this

inc/school.crawl.inc

➞

Show inline comments

@@ @@ -217,6 +217,36 @@ function school_crawl_days_str_format($d @@
 /**
  * \brief
  *   Normalize an upstream meeting type into a standardized name.
  *
  * The input is lowercased and looked up in a table of known
  * abbreviations, first as a whole and then by its first three
  * characters. Unrecognized values are returned lowercased.
  *
  * \param $meeting_type
  *   The upstream's meeting_type, such as 'LEC', 'lec', 'LAB',
  *   etc. New mappings should be added to this function as long as
  *   they are general enough. Empty values are treated as 'lecture'.
  * \return
  *   The standardized (or merely lowercased) meeting type.
  */
 function school_crawl_meeting_type($meeting_type = 'lecture')
 {
   static $known_types = array(
     'lec' => 'lecture',
     'lab' => 'lab',
     'dis' => 'discussion',
   );

   $normalized = empty($meeting_type) ? 'lecture' : strtolower($meeting_type);

   /* Try the full name first, then its three-letter prefix. */
   foreach (array($normalized, substr($normalized, 0, 3)) as $candidate)
     if (!empty($known_types[$candidate]))
       return $known_types[$candidate];

   return $normalized;
 }
 /**
  * \brief
  *   Simulate some aspects of a web browser while retrieving a
  *   document.
+ *
@@ @@ -577,8 +607,10 @@ function school_crawl_url($orig_url, $ur @@
  * \param $column_name
  *   The name of the column to search for.
  * \param $strcmp
  *   The function to use with a strcmp() interface when judging
  *   The function to use with a strcmp($text_content, $column_name) interface when judging
  *   whether or not a <th />'s textContent matches $column_name.
  * \param $trim
  *   The function to apply to the <th />'s textContent before
  *   subjecting it to the $strcmp test.
@@ @@ -591,17 +623,32 @@ function school_crawl_table_resolve_colu @@
+{
   $th_nodelist = school_crawl_table_rownodes($tr_node);
   for ($i = 0; $i < $th_nodelist->length; $i ++)
-    if (!$strcmp($column_name, $trim($th_nodelist->item($i)->textContent)))
+    if (!$strcmp($trim($th_nodelist->item($i)->textContent), $column_name))
       return $i;
   return FALSE;
+}
 /**
  * \brief
  *   Regular expression matcher with a strcmp()-like return value,
  *   suitable as school_crawl_table_resolve_column()'s $strcmp
  *   argument.
  *
  * \param $text_content
  *   The text to test.
  * \param $column_name_regex
  *   A PCRE pattern (with delimiters) to test $text_content against.
  * \return
  *   0 if the pattern matches, 1 otherwise.
  */
 function school_crawl_table_resolve_column_regexcmp($text_content, $column_name_regex)
 {
   return preg_match($column_name_regex, $text_content) ? 0 : 1;
 }
 /**
  * \brief
  *   Get a DOMNodeList of a row's elements without #text elements in
  *   the way.
+ *
  * Helpful when using school_crawl_table_resolve_column() to get data.
+ *
  * \return
  *   A DOMNodeList.
  */
 function school_crawl_table_rownodes(DOMElement $tr_node)
+{

school.d/umich.crawl.inc

➞

Show inline comments

@@ @@ -18,56 +18,14 @@ @@
  * along with SlatePermutate.  If not, see <http://www.gnu.org/licenses/>.
  */
 /**
  * Filter out whitespace items.
  *
  * Meant to be used as an array_filter() callback.
  *
  * \param $item
  *   The value to test.
  * \return
  *   FALSE if $item is empty or consists only of whitespace (so that
  *   array_filter() drops it), TRUE otherwise.
  */
 function umich_arrayfilter_callback($item)
 {
   /* Previously both branches returned TRUE, so nothing was ever filtered. */
   return ltrim((string) $item) !== '';
 }
 /**
  * Parse html at URL into array, first row is row headers.
  *
  * Only the fourth <table /> of the document is read. Its first six
  * rows and its last three rows (navigation and trailing garbage) are
  * discarded, as are rows which end up with no cells.
  *
  * \param $url
  *   The location of the HTML document to download and parse.
  * \return
  *   An array of rows, each an array of cell text indexed by column
  *   number, or 1 if the document could not be retrieved or does not
  *   contain the expected table.
  */
 function umich_table_parse($url)
 {
   $html = file_get_contents($url);
   if (!$html)
     return 1;

   $dom = new DOMDocument;
   $dom->loadHTML($html);
   $dom->preserveWhiteSpace = false;

   /* The data lives in the fourth table on the page. */
   $table = $dom->getElementsByTagName('table')->item(3);
   if ($table === NULL)
     return 1;

   $arr = array();
   foreach ($table->getElementsByTagName('tr') as $rownum => $row)
     {
       if ($rownum <= 5)
         continue;
       foreach ($row->getElementsByTagName('td') as $colnum => $col)
         $arr[$rownum][$colnum] = $col->nodeValue;
     }

   foreach ($arr as &$item)
     $item = array_filter($item, "umich_arrayfilter_callback");
   /* Break the reference so that later writes to $item cannot corrupt $arr. */
   unset($item);

   $arr = array_values($arr); // Reindex array

   /*
    * Strip navigation and trailing garbage (the last three rows), then
    * drop rows which have no cells left. array_slice() copes with
    * tables that have fewer than three rows, unlike assigning to
    * negative indexes.
    */
   $arr = array_filter(array_slice($arr, 0, -3));

   return $arr;
 }
 /**
  * \brief
  *  Crawls University of Michigan's schedule.
+ *
  * Potential startpoints:
  * - http://lsa.umich.edu/cg/cg_advsearch.aspx (HTML/curl-based)
  * - http://ro.umich.edu/schedule/ (harder HTML for semester guessing, one CSV download for entire semester -- <=4MB)
+ *
  * \param $semesters
  *   An array to be filled with semesters.
  * \param $school_crawl_log
@@ @@ -77,26 +35,243 @@ function umich_table_parse($url) @@
  */
 function umich_crawl(array &$semesters, $school_crawl_log)
 {
   /*
    * Overview: download the schedule index page, find every table
    * which has both a `Term' column and a CSV column, and hand the
    * CSV link of each listed semester to umich_crawl_csv(). The first
    * semester which is crawled successfully is appended to $semesters
    * and the function returns. 0 is returned in every case.
    */
   $url = 'http://ro.umich.edu/schedule/';
   $cookies = array();

   /* determine list of semesters: */
   $semesters_dom = new DOMDocument();
   $semesters_dom->loadHTML(school_crawl_geturi($url, $cookies, $school_crawl_log));
   $semesters_xpath = new DOMXPath($semesters_dom);

   $tables_nodelist = $semesters_dom->getElementsByTagName('table');
   foreach ($tables_nodelist as $table)
     {
       /* Grab the first row, which should hold the column headings. */
       $table_tr = NULL;
       foreach ($semesters_xpath->query('tr', $table) as $table_tr)
         break;
       if (empty($table_tr))
         {
           school_crawl_logf($school_crawl_log, 5, "Unable to find first row in table which I suspect to be a table holding all of the semesters I'm interested in.");
           continue;
         }

       $semester_columns = array(
         'name' => school_crawl_table_resolve_column($table_tr, 'Term'),
         'csv' => school_crawl_table_resolve_column($table_tr, '/[cC][sS][vV]/', 'school_crawl_table_resolve_column_regexcmp'),
       );
       /* Column 0 is a valid result, hence the strict comparison against FALSE. */
       foreach ($semester_columns as $semester_column_name => $semester_column)
         if ($semester_column === FALSE)
           {
             school_crawl_logf($school_crawl_log, 4, "Unable to resolve column %s onto a column in a semester listing table. Skipping this table.",
                               $semester_column_name);
             $semester_columns = NULL;
           }
       if (empty($semester_columns))
         continue;

       $first = TRUE;
       foreach ($semesters_xpath->query('tr', $table) as $table_tr)
         {
           /* Skip the heading row. */
           if ($first)
             {
               $first = FALSE;
               continue;
             }

           $rownodes = school_crawl_table_rownodes($table_tr);
           $semester_name = $rownodes->item($semester_columns['name']);
           $semester_csv = $rownodes->item($semester_columns['csv']);
           /* Rows with fewer cells than the heading row cannot be interpreted. */
           if (empty($semester_name) || empty($semester_csv))
             continue;

           if (!preg_match('/^(.+) ([0-9]+)$/', $semester_name->textContent, $matches))
             {
               school_crawl_logf($school_crawl_log, 4, "Unable to parse semester name `%s'. Skipping this semester.",
                                 $semester_name->textContent);
               continue;
             }
           /* $matches[2] is the trailing number (year), $matches[1] the text before it (season). */
           $semester = new Semester($matches[2], $matches[1]);

           /* Use the first <a /> found inside of the CSV cell. */
           $a = NULL;
           foreach ($semesters_xpath->query('descendant::a', $semester_csv) as $a)
             break;
           if (empty($a) || !$a->hasAttribute('href'))
             {
               school_crawl_logf($school_crawl_log, 4, "Unable to find <a /> element with an href attribute for a CSV link for the %s semester. Skipping this semester. (textContent of csv column: %s)",
                                 $semester, $semester_csv->textContent);
               continue;
             }

           /* umich_crawl_csv() returns a false-ish value upon success. */
           if (!umich_crawl_csv($school_crawl_log, $semester, $a->getAttribute('href')))
             {
               $semesters[] = $semester;

               /**
                * \todo
                *   If we try to crawl more than one umich semester,
                *   PHP runs out of memory. We need to bump our API
                *   and rehash script to support incremental crawling
                *   or early data committing if we want umich
                *   crawling to work for more than one semester.
                */
               return 0;
             }
           else
             school_crawl_logf($school_crawl_log, 2, "Unable to interpret CSV information for %s. Skipping semester.",
                               $semester);
         }
     }

   return 0;
 }
 /**
  * \brief
  *   Handle the crawling of one semester of umich.
  *
  * Downloads the CSV file, maps its heading row onto the columns this
  * crawler knows about and adds one section_meeting to $semester per
  * data row.
  *
  * \param $school_crawl_log
  *   The school_crawl_log handle.
  * \param $semester
  *   A Semester object to populate with courses from this semester.
  * \param $csv_href
  *   A link to a CSV file which will be downloaded and parsed.
  * \return
  *   0 on success, 1 if a required column is missing from the CSV.
  */
 function umich_crawl_csv($school_crawl_log, &$semester, $csv_href)
 {
   school_crawl_logf($school_crawl_log, 3, "Crawling %s.",
                     $semester);

   $cookies = array();
   $uri = $csv_href;

   /* parse into lines and then each row needs to be individually parsed */
   $csv = str_getcsv(school_crawl_geturi($uri, $cookies, $school_crawl_log), PHP_EOL);

   /* Maps each known column name onto its column number; FALSE until found. */
   $fields = array(
     'Term' => FALSE /* $semester->season_get() . ' ' . $semester->year_get() */,
     'Session' => FALSE /* "Regular Academic Session", "First 7 Week Session", "Second 7 Week Session" <-- half-semester support? */,
     'Acad Group' => FALSE /* long version of the department sorta, more general than the subject field */,
     'Class Nbr' => FALSE /* unqualified course_id */,
     'Subject' => FALSE /* "Mathematics (MATH)" */,
     'Catalog Nbr' => FALSE /* "10001", i.e. section synonym */,
     'Section' => FALSE,
     'Course Title' => FALSE,
     'Component' => FALSE /* "LAB", "LEC", "REC" -- i.e., meeting_type(?) */,
     'Codes' => FALSE /* "P  W", "P   ", "P R ", "PI  ", "A   ", "P RW" ??????? (reminds me of ``svn status''). If flag[3] = 'W', then the class has a meeting times */,
     'M' => FALSE /* if a day is enabled, it is set to itself. I.e., $row['M'] = 'M' or $row['M'] = '' */,
     'T' => FALSE,
     'W' => FALSE,
     'TH' => FALSE,
     'F' => FALSE,
     'S' => FALSE,
     'SU' => FALSE /* OK, we'll have to add Sunday support someday ;-) */,
     'Start Date' => FALSE,
     'End Date' => FALSE /* "12/13/2011" */,
     'Time' => FALSE /* "1230-130PM", "9-1030AM", "1130-1PM" */,
     'Location' => FALSE,
     'Instructor' => FALSE,
     'Units' => FALSE /* As in credit hours */,
   );
   /* Columns which are known but not needed; their absence is tolerated. */
   $ignored_fields = array(
     'Term' => TRUE,
     'Session' => TRUE,
     'Acad Group' => TRUE,
     'Codes' => TRUE,
     'SU' => TRUE,
     'Units' => TRUE,
   );
   /* NOTE(review): the 'Class Nbr' / 'Catalog Nbr' roles follow the comments above; confirm against real data. */

   foreach (str_getcsv($csv[0]) as $col_num => $col_name)
     if (isset($fields[$col_name]))
       $fields[$col_name] = $col_num;
     else
       school_crawl_logf($school_crawl_log, 6, "We do not recognize the %s column in the CSV file for %s.",
                         $col_name, $semester);

   foreach ($fields as $field => $col_num)
     if ($col_num === FALSE
         && empty($ignored_fields[$field]))
       {
         school_crawl_logf($school_crawl_log, 2, "Unable to find column %s in CSV for %s. Skipping this semester.",
                           $field, $semester);
         return 1;
       }

   /* remove the row with heading from the CSV dataset */
   unset($csv[0]);

   /* Now actually parse some data :-). */
   foreach ($csv as $line)
     {
       /* Blank lines (e.g., a trailing newline) carry no data. */
       if (!strlen(trim((string) $line)))
         continue;
       $row = str_getcsv($line);

       $synonym = trim($row[$fields['Catalog Nbr']]);

       /* The department abbreviation is the parenthesized suffix of the subject. */
       if (!preg_match(';\(([A-Z]+)\)$;', $row[$fields['Subject']], $matches))
         {
           school_crawl_logf($school_crawl_log, 5, "Unable to parse department string `%s'. Skipping section/course (synonym=%s).",
                             $row[$fields['Subject']], $synonym);
           continue;
         }
       $dept = $matches[1];

       /* A day column is non-empty when the meeting occurs on that day. */
       $days = '';
       foreach (array('M' => 'm', 'T' => 't', 'W' => 'w', 'TH' => 'h', 'F' => 'f', 'S' => 's')
                as $field => $day)
         if (strlen(trim($row[$fields[$field]])))
           $days .= $day;

       if (!preg_match(';^([0-9]+)-([0-9]+)([AP])M$;', $row[$fields['Time']], $matches))
         {
           school_crawl_logf($school_crawl_log, 4, "Unable to parse meeting time: `%s'. Skipping section/meeting (synonym=%s).",
                             $row[$fields['Time']], $synonym);
           /* ensure that the class is added nonetheless */
           if ($semester->class_get($dept, $row[$fields['Class Nbr']]) === NULL)
             $semester->class_add(new Course($dept . '-' . $row[$fields['Class Nbr']], $row[$fields['Course Title']]));
           continue;
         }

       /*
        * Only the end time carries an AM/PM marker, so it is parsed
        * first and used to disambiguate the start time.
        */
       $time_end = umich_crawl_time($matches[2], $matches[3]);
       $time_start = umich_crawl_time($matches[1], FALSE, $time_end);

       $semester->section_meeting_add($dept, $row[$fields['Class Nbr']], $row[$fields['Course Title']],
                                      $row[$fields['Section']], $row[$fields['Catalog Nbr']], $row[$fields['Instructor']],
                                      new SectionMeeting($days, $time_start, $time_end, $row[$fields['Location']], school_crawl_meeting_type($row[$fields['Component']])));
     }

   return 0;
 }
 /**
  * \brief
  *   Try to turn a umich-formatted time into something usable.
  *
  * \param $raw
  *   The raw input: an hour optionally followed by two digits of
  *   minutes, such as '9', '130' or '1230'.
  * \param $xm
  *   FALSE or, if PM or AM was specified, 'P' for PM and 'A' for AM.
  * \param $before
  *   A time of day before which this time must be, formatted as
  *   four digits (HHMM). Used generally for the start time of a
  *   class. The end time of a class must be parsed first so that the
  *   result of that calculation may be passed as the $before value.
  *   Only consulted when $xm is FALSE.
  * \return
  *   The time as a four-digit, 24-hour HHMM string.
  */
 function umich_crawl_time($raw, $xm = FALSE, $before = '2400')
 {
   $h = $raw;
   $m = '00';
   if (strlen($raw) > 2)
     {
       $h = substr($raw, 0, strlen($raw) - 2);
       $m = substr($raw, strlen($raw) - 2);
     }
   $h = (int) $h;
   $m = (int) $m;

   /*
    * On a 12-hour clock, 12 is the first hour of its half of the day:
    * 12 AM is 00:xx and 12 PM is 12:xx, not 24:xx.
    */
   $am_h = ($h == 12) ? 0 : $h;
   $pm_h = ($h == 12) ? 12 : $h + 12;

   if ($xm === FALSE)
     {
       $before_minutes = (int) substr($before, 0, 2) * 60 + (int) substr($before, 2);

       /* if the time could feasibly be in the afternoon, assume it is: */
       if ($pm_h * 60 + $m < $before_minutes)
         $xm = 'P';
       else
         $xm = 'A';
     }

   $h = strcmp($xm, 'P') ? $am_h : $pm_h;

   return sprintf('%02d%02d', $h, $m);
 }

0 comments (0 inline, 0 general)