SlatePermutate Changeset - 634b866e665d · protofusion repositories

Changeset - 634b866e665d

Parent rev.

Child rev.

[Not reviewed]

default

0 1 0

Nathan Brink (binki) - 14 years ago 2011-10-16 02:30:53
ohnobinki@ohnopublishing.net

cedarville: Correctly parse courses with multiple professors who are associated with particular section meetings. Fixes bug #96.

1 file changed with 48 insertions and 10 deletions:

school.d/cedarville.crawl.inc

0 comments (0 inline, 0 general)

school.d/cedarville.crawl.inc

➞

Show inline comments

@@ @@ -22,40 +22,40 @@ @@
  * \file
  * \brief
  *   Crawler implementation for Cedarville University.
  */
 /**
  * \brief
  *   Parse given html into an array, first row is row headers.
+ *
  * \param $html
  *   HTML that PHP's DOM would willingly eat.
  */
-function table_parse($html)
+function cedarville_table_parse($html)
+{
   libxml_use_internal_errors(true); // Suppress warnings
   $arr = array();
   $dom = new DOMDocument;
   if(!$html)
     return NULL;
   $dom->loadHTML($html);
   $dom->preserveWhiteSpace = FALSE;
   $tables = $dom->getElementsByTagName('table');
   $rows = $tables->item(0)->getElementsByTagName('tr'); // Get first table on page
   foreach ($rows as $rownum => $row) {
     $cols = $row->getElementsByTagName('td');
     foreach($cols as $colnum => $col){
-      $arr[$rownum][$colnum] = $col->nodeValue;
       $arr[$rownum][$colnum] = $col;
+    }
+  }
   return $arr;
+}
 define('CEDARVILLE_BASE_URI', 'http://cedarville.edu/courses/schedule/');
 define('CEDARVILLE_TIMEZONE_OFFSET', 60*60 * -4);
 /**
  * \brief
  *   Obtain the list of crawlable semesters offered by Cedarville.
+ *
@@ @@ -146,25 +146,25 @@ function cedarville_crawl_semester(array @@
     return 1;
   $tables = array();
   $cookies = array();
   foreach ($departments as $department => $dept_name)
+    {
       school_crawl_logf($school_crawl_log, 7, "Crawling department %s (%s).", $department, $dept_name);
       $uri = CEDARVILLE_BASE_URI . $season_string . '_' . $department . '_all.htm';
       $html = school_crawl_geturi($uri, $cookies, $school_crawl_log);
       if (!$html)
 	continue;
-      $tables[$department] = table_parse(cedarville_html_fix($html));
+      $tables[$department] = cedarville_table_parse(cedarville_html_fix($html));
+    }
   foreach ($tables as $dept_table)
+    {
       /*
        * Discard the first row, which has the contents of the <th />
        * elements.
        */
       unset($dept_table[0]);
       foreach($dept_table as $course_table)
+	{
@@ @@ -193,49 +193,76 @@ function cedarville_crawl_semester(array @@
 	   * It appears that <type> may be:
 	   * LEC: normal lecture meeting.
 	   * ONL: online course.
 	   * ILB: ethan says a partially online course...?
 	   * HYB: hybrid of...?
 	   * FLD: field...?
 	   * FE2: ?
 	   * CLN: ?
 	   * LAB: Lab
 	   * LES: something for some PFMU/PLMU class?
 	   */
 	  $synonym = $course_table[0];
 	  $section_parts = Section::parse($course_table[1]);
 	  $synonym = $course_table[0]->nodeValue;
 	  $section_parts = Section::parse($course_table[1]->nodeValue);
 	  if (count($section_parts) < 3)
+	    {
 	      school_crawl_logf($school_crawl_log, 6, "Error parsing section_id. Given `%s'; interpreted as `%s'. Skipping.",
 				$course_table[1], implode('-', $section_parts));
+				$course_table[1]->nodeValue, implode('-', $section_parts));
 	      continue;
+	    }
 	  $instructor = $course_table[3];
           $title = $course_table[2];
           $title = $course_table[2]->nodeValue;
 	  /*
 	   * For courses with multiple section meetings, each
 	   * instructor for each section meeting is separated by <br/>.
 	   */
 	  $instructors = array('');
 	  foreach ($course_table[3]->childNodes as $child)
 	    switch ($child->nodeType)
+	      {
 	      case XML_ELEMENT_NODE:
 		end($instructors);
 		if (!strcmp($child->tagName, 'br')
 		    && strlen(trim($instructors[key($instructors)])))
 		  $instructors[] = '';
 		else
+		  {
 		    end($instructors);
 		    $instructors[key($instructors)] .= $child->nodeValue;
+		  }
 		break;
 	      case XML_TEXT_NODE:
 		end($instructors);
 		$instructors[key($instructors)] .= $child->data;
 		break;
+	      }
 	  foreach ($instructors as $key => $instructor)
 	    $instructors[$key] = trim($instructor);
 	  /*
 	   * Each course may have multiple meeting times associated
 	   * with it at Cedarville. We are not quite sure how to handle
 	   * this, because different class sections may be tied with
 	   * different lab meetings and stuff...
 	   */
 	  $meetings_str = $course_table[6];
+	  $meetings_str = $course_table[6]->nodeValue;
 	  if (strpos($meetings_str, 'TBA') !== FALSE)
+	    {
 	      school_crawl_logf($school_crawl_log, 8, "Skipping %s because its meeting time info has `TBA' in it.", implode('-', $section_parts));
 	      continue;
+	    }
 	  $meetings = array();
 	  $meeting_i = 0;
 	  $meeting_multiple_types = array();
 	  while (strlen($meetings_str) > 5)
+	    {
 	      $meeting_start_regex = ';^';
 	      $meeting_base_regex = ' ([A-Z]+) +([A-Z]+[A-Z0-9]*) +([MTWRF]{1,5}) +([0-9:AP]+)-([0-9:AP]+)';
 	      $meeting_date_regex = 'Dates:[^0-9]+([/0-9]{8})-([/0-9]{8})';
 	      $meeting_end_regex = ';';
 	      if (!preg_match($meeting_start_regex . $meeting_base_regex . $meeting_date_regex . $meeting_end_regex,
 			      $meetings_str, $meeting_matches)
 		  && !preg_match($meeting_start_regex . $meeting_base_regex . $meeting_end_regex,
 				 $meetings_str, $meeting_matches))
+		{
@@ @@ -265,26 +292,37 @@ function cedarville_crawl_semester(array @@
 	      /* check for daterange information -- i.e., if the first regex successfully matched: */
 	      if (count($meeting_matches) > 7)
+		{
 		  $date_start = school_crawl_gmmktime(strptime($meeting_matches[6], '%m/%d/%y'), CEDARVILLE_TIMEZONE_OFFSET);
 		  $date_end = school_crawl_gmmktime(strptime($meeting_matches[7], '%m/%d/%y'), CEDARVILLE_TIMEZONE_OFFSET);
 		  if (!empty($date_start) && !empty($date_end))
+		    {
 		      $semester->time_start_set_test($date_start);
 		      $semester->time_end_set_test($date_end);
+		    }
+		}
 	      /*
 	       * The tables are made for humans, not computers. If
 	       * there aren't enough instructors for the number of
 	       * section meetings, just reuse the first listed
 	       * instructor:
 	       */
 	      if ($meeting_i >= count($instructors))
 		$instructors[$meeting_i] = $instructors[0];
 	      $meetings[] = new SectionMeeting($days, $time_start, $time_end,
 					       $room, $type, $instructor);
 					       $room, $type, $instructors[$meeting_i]);
 	      $meeting_i ++;
+	    }
 	  $semester->section_add($section_parts['department'], $section_parts['course'],
 				 new Section($section_parts['section'], $meetings,
 					     $synonym), $title);
+	}
+    }
   return 0;
+}
 /**

0 comments (0 inline, 0 general)