Changeset - 1d417d9e6bb3
[Not reviewed]
default
0 2 0
Nathan Brink (binki) - 15 years ago 2010-10-16 20:24:58
ohnobinki@ohnopublishing.net
Cedarville crawler converted to use the generalized crawling interface; it now populates the Semester object with actual section data. Cedarville has some strange course constraints, however, which would require changes to the core algorithm.
2 files changed with 188 insertions and 15 deletions:
0 comments (0 inline, 0 general)
inc/school.crawl.inc
Show inline comments
 
@@ -54,14 +54,15 @@ function school_crawl_time_format($time)
 
 *   An array of day names. These may be common abbreviations or
 
 *   truncations (any truncations must be two chars long for
 
 *   simplicity. One-char representations are supported, however, but
 
 *   use 'm', 't', 'w', 'h', 'f' to distinguish thursday and
 
 *   friday). Case does not matter.
 
 *   use 'm', 't', 'w', 'h', 'f' to distinguish Thursday and
 
 *   Tuesday. 'r' may also be used for Thursday.). Case does not
 
 *   matter.
 
 * \return
 
 *   slate_permutate's strange internal days representation.
 
 */
 
function school_crawl_days_format($days)
 
{
 
  static $daymap_1 = array('m' => 1, 't' => 2, 'w' => 3, 'h' => 4, 'f' => 5);
 
  static $daymap_1 = array('m' => 1, 't' => 2, 'w' => 3, 'h' => 4, 'r' => 4, 'f' => 5);
 
  static $daymap_2 = array('th' => 'h');
 

	
 
  $my_days = array();
 
@@ -94,3 +95,21 @@ function school_crawl_days_format($days)
 

	
 
  return $day_str;
 
}
 

	
 
/**
 * \brief
 *   Convert a string of one-letter day initials into the formatted
 *   days representation.
 *
 * \param $days_str
 *   A string in which every character is a single day initial, for
 *   example 'mwf' or 'TR'.
 * \return
 *   Whatever school_crawl_days_format() returns when handed the list
 *   of individual initials.
 */
function school_crawl_days_str_format($days_str)
{
  $initials = array();
  $length = strlen($days_str);
  $pos = 0;

  /* Break the string apart into one array entry per character. */
  while ($pos < $length)
    {
      $initials[] = $days_str[$pos];
      $pos ++;
    }

  return school_crawl_days_format($initials);
}
school.d/cedarville.inc
Show inline comments
 
@@ -28,16 +28,22 @@ function cedarville_instructions_html()
 
EOF;
 
}
 

	
 
/** Parse html at URL into array, first row is row headers */
 
function table_parse($url) {
 
/**
 
 * \brief
 
 *   Parse given html into an array, first row is row headers
 
 *
 
 * \param $html
 
 *   HTML that PHP's DOM would willingly eat.
 
 */
 
function table_parse($html)
 
{
 
  $arr = array();
 
  $dom = new DOMDocument;
 
  $html = file_get_contents($url);
 
  if(!$html){
 
    return 1;
 
  }
 
  if(!$html)
 
    return NULL;
 

	
 
  $dom->loadHTML($html);
 
  $dom->preserveWhiteSpace = false;
 
  $dom->preserveWhiteSpace = FALSE;
 
  $tables = $dom->getElementsByTagName('table');
 
  $rows = $tables->item(0)->getElementsByTagName('tr'); // Get first table on page 
 
  foreach ($rows as $rownum => $row) {
 
@@ -50,7 +56,7 @@ function table_parse($url) {
 
}
 

	
 
/** Crawls Cedarville course listings for the given $semester. The season is abbreviated to "fa" or "sp" and the year is a 4-digit year. */
 
function cedarville_crawl($semester)
 
function cedarville_crawl($semester, $verbosity = 1)
 
{  
 
  $season = strtolower(substr($semester->season_get(), 0, 2));
 
  $year = $semester->year_get();
 
@@ -61,9 +67,157 @@ function cedarville_crawl($semester)
 

	
 
  $season = strtolower($season);
 
  $tables = array();
 
  foreach($departments as $department) {
 
    $tables[$department] = table_parse($basepath . $year . $season . '_' . $department . '_' . 'all.htm');
 
  }
 
  return $tables;
 
  foreach($departments as $department)
 
    {
 
      $html = file_get_contents($basepath . $year . $season . '_' . $department . '_' . 'all.htm');
 
      if (!$html)
 
	continue;
 
      $tables[$department] = table_parse(cedarville_html_fix($html));
 
    }
 

	
 
  foreach ($tables as $dept_table)
 
    {
 
      /*
 
       * Discard the first row, which has the contents of the <th />
 
       * elements.
 
       */
 
      unset($dept_table[0]);
 

	
 
      foreach($dept_table as $course_table)
 
	{
 
	  /*
 
	   * format:
 
	   * 0: course synonym, an unsigned integer.
 
	   * 1: section spec, parsable by Section::parse().
 
	   * 2: friendly course title.
 
	   * 3: Instructor name.
 
	   * 4: Number of credit hours in decimal notation.
 
	   * 5: Fee.
 
	   * 6: Meeting time, explained below.
 
	   * 7: Cap.
 
	   * 8-10: Textbook link. Most rows only have column 8, not
 
	   *       all the way through 10. This information seems
 
	   *       quite useless.
 
	   *
 
	   * Section meeting time/place format:
 
	   *
 
	   * Confusing example: ' ILB  WI219   TR    08:30A-09:45A'
 
	   * Complete example plus lab: ' LEC  TYL203  MWF   08:00A-08:50A LAB  ENS118  TR    03:00P-04:30P'
 
	   *
 
	   * Appears to have format:
 
	   * <meeting_info>: <type> <room> <days> <time_start>-<time_end> <meeting_info>
 
	   *
 
	   * It appears that <type> may be:
 
	   * LEC: normal lecture meeting.
 
	   * ONL: online course.
 
	   * ILB: ethan says a partially online course...?
 
	   * HYB: hybrid of...?
 
	   * FLD: field...?
 
	   * FE2: ?
 
	   * CLN: ?
 
	   * LAB: Lab
 
	   * LES: something for some PFMU/PLMU class?
 
	   */
 

	
 
	  $synonym = $course_table[0];
 
	  $section_parts = Section::parse($course_table[1]);
 
	  if (count($section_parts) < 3)
 
	    {
 
	      error_log('Error parsing section_id. Given `' . $course_table[1] . '\', interpreted as `'
 
			. implode('-', $section_parts) . '\'. Skipping.');
 
	      continue;
 
	    }
 

	
 
	  $instructor = $course_table[3];
 

	
 
	  /*
 
	   * Each course may have multiple meeting times associated
 
	   * with it at Cedarville. We are not sure how to handle this
 
	   * quite, because different class sections may be tied with
 
	   * different lab meetings and stuff...
 
	   */
 
	  $meetings_str = $course_table[6];
 
	  if (strpos($meetings_str, 'TBA') !== FALSE)
 
	    {
 
	      if ($verbosity > 1)
 
		error_log('Skipping ' . implode('-', $section_parts) . ' because its meeting time info has `TBA\' in it.');
 
	      continue;
 
	    }
 
	  $meetings = array();
 
	  $meeting_multiple_types = array();
 
	  while (strlen($meetings_str) > 5)
 
	    {
 
	      if (!preg_match(';^ ([A-Z]+) +([A-Z]+[A-Z0-9]*) +([MTWRF]{1,5}) +([0-9:AP]+)-([0-9:AP]+);',
 
			      $meetings_str, $meeting_matches))
 
		{
 
		  if (preg_match(';^Dates:[^0-9]+([/0-9]{8})-([/0-9]{8});',
 
				 $meetings_str, $meeting_matches))
 
		    {
 
		      if ($verbosity > 4)
 
			error_log('Skipping some meeting data for '
 
				  . implode('-', $section_parts) . ' because it is a date range: `'
 
				  . $meeting_matches[0] . '\'');
 
		      $meetings_str = substr($meetings_str, strlen($meeting_matches[0]));
 
		      continue;
 
		    }
 

	
 
		  if ($verbosity > 0)
 
		    error_log('Error parsing meeting time. Given `' . $meetings_str . '\'. Skipping '
 
			      . implode('-', $section_parts));
 
		  break;
 
		}
 
	      /* prepare for parsing the next meeting time */
 
	      $meetings_str = substr($meetings_str, strlen($meeting_matches[0]));
 

	
 
	      if (isset($meetings[$meeting_matches[1]]))
 
		{
 
		  if ($verbosity > 0 && !isset($meeting_multiple_types[$meeting_matches[1]]))
 
		    {
 
		      error_log('Section ' . implode('-', $section_parts)
 
				. ' has multiple meeting times for meeting_type of '
 
				. $meeting_matches[1] . ' which my unflexible code which'
 
				. ' could be made more flexible doesn\'t yet support.'
 
				. ' Skipping the extra meeting times for this type of meeting.');
 
		      /* only give the above error once per type. */
 
		      $meeting_multiple_types[$meeting_matches[1]] = TRUE;
 
		    }
 
		  continue;
 
		}
 

	
 
	      $meetings[$meeting_matches[1]]
 
		= array('room' => $meeting_matches[2],
 
			'days' => school_crawl_days_str_format($meeting_matches[3]),
 
			'time_start' => school_crawl_time_format(strptime($meeting_matches[4] . 'M', '%I:%M%p')),
 
			'time_end' => school_crawl_time_format(strptime($meeting_matches[5] . 'M', '%I:%M%p')),
 
			'type' => $meeting_matches[1], 
 
			);
 
	    }
 

	
 
	  foreach ($meetings as $meeting)
 
	    {
 
	      $section_letter = $section_parts['section'];
 
	      if ($meeting['type'] == 'LECT')
 
		/**
 
		 * \todo this might not make much sense.		 
 
		 */
 
		$section_letter = 'L' . $section_letter;
 
	      $semester->section_add($section_parts['department'], $section_parts['course'],
 
				     new Section($section_letter, $instructor,
 
						 $meeting['time_start'], $meeting['time_end'],
 
						 $meeting['days']));
 
	    }
 
	}
 
    }
 

	
 
  return 0;
 
}
 

	
 
/**
 * \brief
 *   Fix some incorrect usage of the HTML entity delimiter, the ampersand.
 *
 * Every '&' which does not begin a well-formed character reference
 * (a named one such as '&nbsp;', a decimal one such as '&#8217;' or
 * a hexadecimal one such as '&#x2019;') is rewritten as '&amp;'.
 * Ampersands which already start such a reference are left alone, so
 * running this over already-correct markup does not change it.
 *
 * \param $html
 *   The HTML source which may contain bare ampersands.
 * \return
 *   The HTML with bare ampersands escaped as '&amp;'.
 */
function cedarville_html_fix($html)
{
  /*
   * The negative lookahead rejects only those ampersands which are
   * followed by a complete reference including its terminating
   * semicolon. This handles doubled ampersands, ampersands close to
   * the end of the input and long or numeric entities alike.
   */
  return preg_replace('/&(?!(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);)/',
		      '&amp;', $html);
}
0 comments (0 inline, 0 general)