Changeset - 634b866e665d
[Not reviewed]
default
0 1 0
Nathan Brink (binki) - 14 years ago 2011-10-16 02:30:53
ohnobinki@ohnopublishing.net
cedarville: Correctly parse courses with multiple professors who are associated with particular section meetings. Fixes bug #96.
1 file changed with 48 insertions and 10 deletions:
0 comments (0 inline, 0 general)
school.d/cedarville.crawl.inc
Show inline comments
 
@@ -28,13 +28,13 @@
 
 * \brief
 
 *   Parse given html into an array, first row is row headers.
 
 *
 
 * \param $html
 
 *   HTML that PHP's DOM would willingly eat.
 
 */
 
function table_parse($html)
 
function cedarville_table_parse($html)
 
{
 
  libxml_use_internal_errors(true); // Suppress warnings
 
  $arr = array();
 
  $dom = new DOMDocument;
 
  if(!$html)
 
    return NULL;
 
@@ -43,13 +43,13 @@ function table_parse($html)
 
  $dom->preserveWhiteSpace = FALSE;
 
  $tables = $dom->getElementsByTagName('table');
 
  $rows = $tables->item(0)->getElementsByTagName('tr'); // Get first table on page 
 
  foreach ($rows as $rownum => $row) {
 
    $cols = $row->getElementsByTagName('td');
 
    foreach($cols as $colnum => $col){
 
      $arr[$rownum][$colnum] = $col->nodeValue;
 
      $arr[$rownum][$colnum] = $col;
 
    }
 
  }
 
  return $arr;
 
}
 

	
 
define('CEDARVILLE_BASE_URI', 'http://cedarville.edu/courses/schedule/');
 
@@ -152,13 +152,13 @@ function cedarville_crawl_semester(array
 
      school_crawl_logf($school_crawl_log, 7, "Crawling department %s (%s).", $department, $dept_name);
 

	
 
      $uri = CEDARVILLE_BASE_URI . $season_string . '_' . $department . '_all.htm';
 
      $html = school_crawl_geturi($uri, $cookies, $school_crawl_log);
 
      if (!$html)
 
	continue;
 
      $tables[$department] = table_parse(cedarville_html_fix($html));
 
      $tables[$department] = cedarville_table_parse(cedarville_html_fix($html));
 
    }
 

	
 
  foreach ($tables as $dept_table)
 
    {
 
      /*
 
       * Discard the first row, which has the contents of the <th />
 
@@ -199,37 +199,64 @@ function cedarville_crawl_semester(array
 
	   * FE2: ?
 
	   * CLN: ?
 
	   * LAB: Lab
 
	   * LES: something for some PFMU/PLMU class?
 
	   */
 

	
 
	  $synonym = $course_table[0];
 
	  $section_parts = Section::parse($course_table[1]);
 
	  $synonym = $course_table[0]->nodeValue;
 
	  $section_parts = Section::parse($course_table[1]->nodeValue);
 
	  if (count($section_parts) < 3)
 
	    {
 
	      school_crawl_logf($school_crawl_log, 6, "Error parsing section_id. Given `%s'; interpreted as `%s'. Skipping.",
 
				$course_table[1], implode('-', $section_parts));
 
				$course_table[1]->nodeValue, implode('-', $section_parts));
 
	      continue;
 
	    }
 

	
 
	  $instructor = $course_table[3];
 
          $title = $course_table[2];
 
          $title = $course_table[2]->nodeValue;
 

	
 
	  /*
 
	   * For courses with multiple section meetings, each
 
	   * instructor for each section meeting is separated by <br/>.
 
	   */
 
	  $instructors = array('');
 
	  foreach ($course_table[3]->childNodes as $child)
 
	    switch ($child->nodeType)
 
	      {
 
	      case XML_ELEMENT_NODE:
 
		end($instructors);
 
		if (!strcmp($child->tagName, 'br')
 
		    && strlen(trim($instructors[key($instructors)])))
 
		  $instructors[] = '';
 
		else
 
		  {
 
		    end($instructors);
 
		    $instructors[key($instructors)] .= $child->nodeValue;
 
		  }
 
		break;
 
	      case XML_TEXT_NODE:
 
		end($instructors);
 
		$instructors[key($instructors)] .= $child->data;
 
		break;
 
	      }
 
	  foreach ($instructors as $key => $instructor)
 
	    $instructors[$key] = trim($instructor);
 

	
 
	  /*
 
	   * Each course may have multiple meeting times associated
 
	   * with it at Cedarville. We are not quite sure how to handle
 
	   * this, because different class sections may be tied with
 
	   * different lab meetings and stuff...
 
	   */
 
	  $meetings_str = $course_table[6];
 
	  $meetings_str = $course_table[6]->nodeValue;
 
	  if (strpos($meetings_str, 'TBA') !== FALSE)
 
	    {
 
	      school_crawl_logf($school_crawl_log, 8, "Skipping %s because its meeting time info has `TBA' in it.", implode('-', $section_parts));
 
	      continue;
 
	    }
 
	  $meetings = array();
 
	  $meeting_i = 0;
 
	  $meeting_multiple_types = array();
 
	  while (strlen($meetings_str) > 5)
 
	    {
 
	      $meeting_start_regex = ';^';
 
	      $meeting_base_regex = ' ([A-Z]+) +([A-Z]+[A-Z0-9]*) +([MTWRF]{1,5}) +([0-9:AP]+)-([0-9:AP]+)';
 
	      $meeting_date_regex = 'Dates:[^0-9]+([/0-9]{8})-([/0-9]{8})';
 
@@ -271,14 +298,25 @@ function cedarville_crawl_semester(array
 
		    {
 
		      $semester->time_start_set_test($date_start);
 
		      $semester->time_end_set_test($date_end);
 
		    }
 
		}
 

	
 
	      /*
 
	       * The tables are made for humans, not computers. If
 
	       * there aren't enough instructors for the number of
 
	       * section meetings, just reuse the first listed
 
	       * instructor:
 
	       */
 
	      if ($meeting_i >= count($instructors))
 
		$instructors[$meeting_i] = $instructors[0];
 

	
 
	      $meetings[] = new SectionMeeting($days, $time_start, $time_end,
 
					       $room, $type, $instructor);
 
					       $room, $type, $instructors[$meeting_i]);
 

	
 
	      $meeting_i ++;
 
	    }
 

	
 
	  $semester->section_add($section_parts['department'], $section_parts['course'],
 
				 new Section($section_parts['section'], $meetings,
 
					     $synonym), $title);
 
	}
0 comments (0 inline, 0 general)