# HG changeset patch
# User Nathan Phillip Brink <ohnobinki@ohnopublishing.net>
# Date 2011-04-09 16:07:12
# Node ID 775e75832d2e1775387f4dcdb66412dd86974cd7
# Parent  fbcf85c2f1bbe49b8ca63284fa9e5e9917bc23a0

Support multiple section meetings for school_id=calvin. I was warned about this by Tom Graham but I never listened ;-). Fixes bug 109.

diff --git a/inc/school.crawl.inc b/inc/school.crawl.inc
--- a/inc/school.crawl.inc
+++ b/inc/school.crawl.inc
@@ -240,7 +240,7 @@ function school_crawl_meeting_type($meet
   if (empty($meeting_type))
     $meeting_type = 'lecture';
 
-  $meeting_type = strtolower($meeting_type);
+  $meeting_type = strtolower(trim($meeting_type));
   if (!empty($meeting_type_maps[$meeting_type]))
     $meeting_type = $meeting_type_maps[$meeting_type];
   elseif (!empty($meeting_type_maps[substr($meeting_type, 0, 3)]))
diff --git a/school.d/calvin.crawl.inc b/school.d/calvin.crawl.inc
--- a/school.d/calvin.crawl.inc
+++ b/school.d/calvin.crawl.inc
@@ -49,28 +49,8 @@ function calvin_crawl(array &$semesters,
 
   $cookies = array();
 
-  $baseuri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
-
-  $token_uri = $baseuri . '&TOKENIDX=NULL';
-  $token_html = calvin_crawl_noscript_filter(school_crawl_geturi($token_uri, $cookies, $school_crawl_log));
-  if (!preg_match('/setWindowHTML\(\'\', \'([0-9]+)\'\);/', $token_html, $matches))
-    {
-      school_crawl_logf($school_crawl_log, 1, "Could not steal the token: crawling failed.");
-      return 1;
-    }
-  $token = $matches[1];
-
-  school_crawl_logf($school_crawl_log, 7, "token: %s.", $token);
-  school_crawl_logf($school_crawl_log, 7, "");
-
-  /*
-   * here we have arrived at the main webadvisor screen which lists the
-   * search form. From here, we can get a list of all of the departments
-   * that Calvin College has and then know enough to query each
-   * individual department for courses.
-   */
-  $uri = $baseuri . '&TOKENIDX=' . $token;
-  $departments_html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log));
+  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
+  $departments_html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log);
 
   $departments_dom = new DOMDocument();
   $departments_dom->loadHTML($departments_html);
@@ -253,10 +233,10 @@ function calvin_crawl(array &$semesters,
 	  /* either 'Open' (or 'Closed'?) */
 	  $openness = dom_input_value($results_dom, 'LIST.VAR1_' . $list_row);
 	  $sec_short_title = dom_id_content($results_dom, 'SEC_SHORT_TITLE_' . $list_row);
-	  $sec_meeting_info = dom_id_content($results_dom, 'SEC_MEETING_INFO_' . $list_row);
+	  $sec_meetings_info = dom_id_content($results_dom, 'SEC_MEETING_INFO_' . $list_row);
 
 	  /* check if we're done with this particular page */
-	  if (!strlen($openness) && !strlen($sec_short_title) && !strlen($sec_meeting_info))
+	  if (!strlen($openness) && !strlen($sec_short_title) && !strlen($sec_meetings_info))
 	    {
 	      $list_done = TRUE;
 	      break;
@@ -269,6 +249,7 @@ function calvin_crawl(array &$semesters,
 	  $faculty_name = dom_input_value($results_dom, 'SEC.FACULTY.INFO_' . $list_row);
 	  $credits = dom_input_value($results_dom, 'SEC.MIN.CRED_' . $list_row); /* or id="SEC_FACULTY_INFO_$list_row" */
 	  $comment = dom_id_content($results_dom, 'SEC_COMMENTS_' . $list_row); /* or name="SEC.COMMENTS_$list_row" */
+	  $short_title_onclick = $results_dom->getElementById('SEC_SHORT_TITLE_' . $list_row)->getAttribute('onclick');
 
 	  /* parse */
 	  $section_id = Section::parse($sec_short_title);
@@ -283,7 +264,7 @@ function calvin_crawl(array &$semesters,
 	  school_crawl_logf($school_crawl_log, 10, "");
 	  school_crawl_logf($school_crawl_log, 10, implode('-', $section_id) . ': ' . $sec_short_title);
 	  school_crawl_logf($school_crawl_log, 10, $openness);
-	  school_crawl_logf($school_crawl_log, 10, $sec_meeting_info);
+	  school_crawl_logf($school_crawl_log, 10, $sec_meetings_info);
 	  school_crawl_logf($school_crawl_log, 10, $faculty_name);
 	  school_crawl_logf($school_crawl_log, 10, $credits);
 	  school_crawl_logf($school_crawl_log, 10, $comment);
@@ -304,8 +285,22 @@ function calvin_crawl(array &$semesters,
 	   *
 	   * '01/31/2011-05/12/2011 Music Ensemble Monday, Wednesday, Thursday, Friday 03:30PM - 04:20PM, Covenant Fine Arts Center, Room 135'
 	   *
+	   * OR, per
+	   * https://protofusion.org/bugzilla/show_bug.cgi?id=109 , we
+	   * must parse the following on the main listing page and
+	   * then parse more on the ``course details'' page:
+	   *
+	   * '09/06/2011-12/16/2011 Lecture Tuesday, Wednesday, Friday 12:30PM - 01:20PM, Science Building, Room 276 (more)...'
+	   *
+	   * The more on the ``course details'' page:
+	   *
+	   * '09/06/2011-12/16/2011 Lecture Tuesday, Wednesday, Friday 12:30PM - 01:20PM, Science Building, Room 276 09/06/2011-12/16/2011 Lecture Thursday 10:30AM - 12:20PM, Science Building, Room 276'
+	   *
+	   * Looks like in this last case parsing from right-to-left
+	   * will be best.
+	   *
 	   * In the second case.... we'll just ignore the section. In
-	   * the last case, we have to be careful about parsing out
+	   * the third case, we have to be careful about parsing out
 	   * Monday.
 	   *
 	   * At this point, we don't parse most tokens. We group them
@@ -314,19 +309,58 @@ function calvin_crawl(array &$semesters,
 	   * the list of days of week the section meets, the start
 	   * time, the end time, and then the meeting location.
 	   */
-	  if (strpos($sec_meeting_info, 'Times to be Announced') !== FALSE
-	      || strpos($sec_meeting_info, 'Days to be Announced') !== FALSE)
+	  if (strpos($sec_meetings_info, 'Times to be Announced') !== FALSE
+	      || strpos($sec_meetings_info, 'Days to be Announced') !== FALSE)
 	    {
 	      school_crawl_logf($school_crawl_log, 8, 'Skipping class because of incomplete meeting time information: '
 				. implode('-', $section_id) . ' has meeting info of `'
-				. $sec_meeting_info . '\'');
+				. $sec_meetings_info . '\'');
 	      $skipped_sections['incomplete meeting info'] ++;
 	      /* Still add to have less confusing autocomplete */
 	      calvin_crawl_course_add($semester, $section_id['department'], $section_id['course'], $title);
 	      continue;
 	    }
 
-	  if (!preg_match(';^([0-9]{2}/[0-9]{2}/[0-9]{4})-([0-9]{2}/[0-9]{2}/[0-9]{4}) (([^ ,]+ )+)([^0-9]+) ([^ ]+) - ([^ ]+), (.*)$;', $sec_meeting_info, $meeting_info_matches))
+	  /*
+	   * Check whether or not we have to pursue details on the
+	   * ``course detail page''. If we do, we might as well just
+	   * parse the line of information available there instead of
+	   * the same from the main listing page.
+	   */
+	  if (preg_match('; \\(more\\)...$;', $sec_meetings_info)
+	      && preg_match(';^javascript:window\\.open\\(\'(.*?[^\\\\])\',;', $short_title_onclick, $short_title_onclick_matches))
+	    {
+	      $more_details_url = $short_title_onclick_matches[1];
+	      $more_details_uri = strstr($uri, '?', TRUE) . $more_details_url;
+
+	      school_crawl_logf($school_crawl_log, 8, 'Fetching extra course information page for %s-%s-%s from %s.',
+				$section_id['department'], $section_id['course'], $section_id['section'],
+				$more_details_uri);
+	      $more_details_html = calvin_crawl_geturi($more_details_uri, $cookies, $school_crawl_log);
+	      $more_details_dom = new DOMDocument();
+	      $more_details_dom->loadHTML($more_details_html);
+
+	      /* Hopefully 'LIST_VAR12_1' is pretty constant... */
+	      foreach ($more_details_dom->getElementById('LIST_VAR12_1')->childNodes as $more_details_child)
+		{
+		  if ($more_details_child->nodeType != XML_TEXT_NODE)
+		    continue;
+		  $sec_meetings_info = $more_details_child->wholeText;
+		  break;
+		}
+	      school_crawl_log($school_crawl_log, 8, "Result of fetching additional meeting information on next line(s):\n%s",
+			       $sec_meetings_info);
+	    }
+
+	  /*
+	   * If we have a course with multiple section_meetings, then
+	   * $sec_meetings_info is split into each meeting by a
+	   * "\n"
+	   */
+
+	  foreach (explode("\n", $sec_meetings_info) as $sec_meeting_info)
+	    {
+	      if (!preg_match(';^([0-9]{2}/[0-9]{2}/[0-9]{4})-([0-9]{2}/[0-9]{2}/[0-9]{4}) (([^ ,]+ )+)([^0-9]+) ([^ ]+) - ([^ ]+), (.*)$;', $sec_meeting_info, $meeting_info_matches))
 	    {
 	      school_crawl_logf($school_crawl_log, 8, 'Unable to parse calvin section meeting info string into start/end/days information for '
 				. implode('-', $section_id) . ': ``' . $sec_meeting_info . '\'\'');
@@ -341,7 +375,7 @@ function calvin_crawl(array &$semesters,
 	  $date_start = $meeting_info_matches[1];
 	  $date_end = $meeting_info_matches[2];
 	  /* e.g., 'Lecture', 'Practicum' */
-	  $meeting_type = strtolower(trim($meeting_info_matches[3]));
+	  $meeting_type = school_crawl_meeting_type($meeting_info_matches[3]);
 
 	  $days = school_crawl_days_format($school_crawl_log, explode(', ', $meeting_info_matches[5]));
 	  $time_start = school_crawl_time_format(strptime($meeting_info_matches[6], '%I:%M%p'));
@@ -351,8 +385,8 @@ function calvin_crawl(array &$semesters,
 	  foreach (array('date_start', 'date_end', 'meeting_type', 'days', 'time_start', 'time_end', 'meeting_place', 'meeting_type') as $var)
 	    school_crawl_logf($school_crawl_log, 10, "%s:%s", $var, ${$var});
 
-	  $section = new Section($section_id['section'], array(new SectionMeeting($days, $time_start, $time_end, $meeting_place, $meeting_type, $faculty_name)), $synonym);
-	  $semester->section_add($section_id['department'], $section_id['course'], $section, $title);
+	  $semester->section_meeting_add($section_id['department'], $section_id['course'], $title, $section_id['section'], $synonym,
+					 new SectionMeeting($days, $time_start, $time_end, $meeting_place, $meeting_type, $faculty_name));
 
 	  /*
 	   * Try to update semester's longetivity stats to help the
@@ -373,6 +407,7 @@ function calvin_crawl(array &$semesters,
 		$semester_end_max = $date_end_time;
 	    }
 	}
+	}
 
       if (!preg_match(';Page ([0-9]+) of ([0-9]+)\</td\>$;m', $html, $pages))
 	{
@@ -403,6 +438,7 @@ function calvin_crawl(array &$semesters,
     /*
      * Calculate lab-based course dependencies.
      */
+    school_crawl_logf($school_crawl_log, 7, 'Adding implicit lab dependencies.');
     foreach ($semester->departments_get() as $department)
       foreach ($semester->department_classes_get($department) as $course)
         {
@@ -491,6 +527,60 @@ function calvin_crawl_noscript_filter($h
 
 /**
  * \brief
+ *   Fetch a URL, following WebAdvisor's TOKENIDX= handshake and
+ *   filtering the result with calvin_crawl_noscript_filter().
+ *
+ * \param $uri
+ *   The URL; rewritten in place to the URL finally fetched.
+ * \param $cookies
+ *   The cookies (yum!).
+ * \param $school_crawl_log
+ *   The school_crawl_log.
+ * \return
+ *   The filtered HTML of the last page fetched.
+ */
+function calvin_crawl_geturi(&$uri, array &$cookies, &$school_crawl_log)
+{
+  /* Every request carries TOKENIDX; start from NULL when none is given. */
+  if (strpos($uri, 'TOKENIDX') === FALSE)
+    $uri .= (strpos($uri, '?') === FALSE ? '?' : '&') . 'TOKENIDX=NULL';
+
+  $token_html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log));
+
+  /* Without a setWindowHTML() call there is no token to follow. */
+  if (!preg_match('/setWindowHTML\(\'\', \'([0-9]+)\'\);/', $token_html, $matches))
+    return $token_html;
+  $token = $matches[1];
+
+  school_crawl_logf($school_crawl_log, 7, "Using WebAdvisor token: %s.", $token);
+  school_crawl_logf($school_crawl_log, 7, "");
+
+  /*
+   * setWindowHTML() will first remove the query string parameters
+   * 'CLONE' and 'FORCEIDX'. Then it appends TOKENIDX=<token> to the
+   * query parameters.
+   *
+   * Example, where TOKENIDX does not start out as NULL but where a
+   * CLONE=Y command is being sent:
+   *
+   * Input: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE=Y&CLONE_PROCESS=Y&SPAUKQ=708501792841963&CONSTITUENCY=WBST&TOKENIDX=1507971558
+   *
+   * Result: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE_PROCESS=Y&SPAUKQ=708501792841963&CONSTITUENCY=WBST&TOKENIDX=2281086932
+   *
+   * The lookbehind leaves the preceding separator unconsumed so that
+   * adjacent CLONE/FORCEIDX parameters are each matched; [^&]* also
+   * strips parameters whose value is empty.
+   */
+  $uri = preg_replace('/(?<=[?&])(?:CLONE|FORCEIDX)=[^&]*(?:&|$)/', '', $uri);
+  /* Stripping the last parameter can leave a dangling separator. */
+  $uri = rtrim($uri, '&');
+  $uri = preg_replace('/([?&])TOKENIDX=[^&]+/', '$1TOKENIDX=' . $token, $uri);
+
+  return calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log));
+}
+
+/**
+ * \brief
  *   Add a course to a semester if that semester doesn't yet have this
  *   course.
  *