Changeset - d6fbd191da57
[Not reviewed]
default
0 1 0
Nathan Brink (binki) - 13 years ago 2012-04-26 01:33:07
ohnobinki@ohnopublishing.net
Remove unnecessary CONSTITUENCY GET parameter from the Calvin College crawler.
1 file changed with 6 insertions and 6 deletions:
0 comments (0 inline, 0 general)
school.d/calvin.crawl.inc
Show inline comments
 
@@ -22,64 +22,64 @@
 
 * \brief
 
 *   Retrieve a list of crawlable semesters from Calvin College.
 
 *
 
 * \param $school
 
 *   The calvin school handle.
 
 * \param $semesters
 
 *   The array to populate with empty Semester objects.
 
 * \param $school_crawl_log
 
 *   A school_crawl_log handle for informing the user/developer of
 
 *   progress.
 
 */
 
function calvin_crawl_semester_list(array $school, array &$semesters, &$school_crawl_log)
 
{
 
  $season_map = array(
 
		      'FA' => Semester::SEASON_FALL,
 
		      'IN' => 'interim',
 
		      'SP' => Semester::SEASON_SPRING,
 
		      'MA' => 'may',
 
		      /* I don't know if SU is a valid Calvin Semester ID or not */
 
		      'SU' => Semester::SEASON_SUMMER);
 

	
 
  /**
 
   * The first link we start at is the one from KV into WebAdvisor.
 
   *
 
   * 1. https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL
 
   * 1. https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL
 
   *    <body onload="javascript:getWindowHTML();">
 
   *
 
   *    Calls javascript:getWindowHTML(). This merely adds
 
   *    TOKENIDX=NULL to the query string, so we can skip this step
 
   *    and just have TOKENIDX=NULL.
 
   *
 
   * 2. https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL&TOKENIDX=NULL
 
   * 2. https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL&TOKENIDX=NULL
 
   *    <body onload="javascript:setWindowHTML('', '7699844013');">
 
   *
 
   *    In the above, the second argument to setWindowHTML() is
 
   *    random. Thus, we have to capture this value.
 
   */
 

	
 
  $cookies = array();
 
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 
  $semesters_html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log);
 

	
 
  $semesters_dom = new DOMDocument();
 
  $semesters_dom->loadHTML($semesters_html);
 

	
 
  /*
 
   * Discover the available semesters
 
   */
 
  $semesters_var1 = $semesters_dom->getElementById('VAR1');
 
  if (empty($semesters_var1))
 
    {
 
      school_crawl_logf($school_crawl_log, 0, "Error: Unable to load list of semesters.");
 
      return 1;
 
    }
 
  $semesters_select_nodes = $semesters_var1->childNodes;
 
  foreach ($semesters_select_nodes as $semester_node)
 
    {
 
      if ($semester_node->tagName != 'option'
 
	  || !$semester_node->hasAttribute('value')
 
	  || !strlen($semester_node->getAttribute('value')))
 
	continue;
 

	
 
      $semester_str = $semester_node->getAttribute('value');
 

	
 
@@ -94,49 +94,49 @@ function calvin_crawl_semester_list(arra
 
      $year = $year_timespec['tm_year'] + 1900;
 

	
 
      $semester = new Semester($year, $season);
 
      $semesters[$semester_str] = $semester;
 
    }
 
  $semester = array_reverse($semesters, TRUE);
 

	
 
  return 0;
 
}
 

	
 
/**
 
 * \brief
 
 *   Crawl the courses for a semester from Calvin College.
 
 *
 
 * \param $school
 
 *   The calvin school handle.
 
 * \param $semester
 
 *   The Semester object to populate with courses.
 
 * \param $school_crawl_log
 
 *   The logger handle.
 
 */
 
function calvin_crawl_semester(array $school, Semester $semester, &$school_crawl_log)
 
{
 
  $cookies = array();
 
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?CONSTITUENCY=WBST&type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 
  $uri = 'https://kvdata.calvin.edu/walive/WebAdvisor?type=P&pid=ST-WESTS12A&LASTTOKEN=NULL';
 
  $html = calvin_crawl_geturi($uri, $cookies, $school_crawl_log);
 
  $seed_dom = new DOMDocument();
 
  $seed_dom->loadHTML($html);
 
  $return_url = dom_input_value($seed_dom, 'RETURN.URL');
 

	
 
  /*
 
   * First, read all of the friendly subject/department names. They're
 
   * not in the output, but they're in the ``Subjects'' dropdown of
 
   * the input form. The <select name="LIST.VAR1_1" id="LIST_VAR1_1"/>
 
   * is associated with subjects/departments.
 
   */
 
  foreach (school_crawl_form_select_array($seed_dom->getElementById('LIST_VAR1_1')) as $department_id => $department_name)
 
    $semester->department_name_set($department_id, trim(reset($department_name)));
 

	
 
  /*
 
   * LIST.VAR<X>_<N>: <X> is the column, <N> is the row. There
 
   * are apparently a max of 5 rows (see the LIST.VAR<X>_MAX
 
   * below).
 
   *
 
   * Columns:
 
   * LIST.VAR1: department
 
   * LIST.VAR2: course_level
 
   * LIST.VAR3: IIRC, a course identifier, such as 156 from MATH-156
 
   * LIST.VAR4: I forget
 
@@ -538,51 +538,51 @@ function calvin_crawl_geturi(&$uri, arra
 
      else
 
	$uri .= '&';
 

	
 
      /* Starting value. */
 
      $uri .= 'TOKENIDX=NULL';
 
    }
 

	
 
  $token_html = calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log));
 

	
 
  if (!preg_match('/setWindowHTML\(\'\', \'([0-9]+)\'\);/', $token_html, $matches))
 
    return $token_html;
 
$token = $matches[1];
 

	
 
  school_crawl_logf($school_crawl_log, 7, "Using WebAdvisor token: %s.", $token);
 
  school_crawl_logf($school_crawl_log, 7, "");
 

	
 
  /*
 
   * setWindowHTML() will first remove the query string parameters
 
   * 'CLONE' and 'FORCEIDX'. Then it appends TOKENIDX=<token> to the
 
   * query parameters.
 
   *
 
   * Example, where TOKENIDX does not start out as NULL but where a
 
   * CLONE=Y command is being sent:
 
   *
 
   * Input: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE=Y&CLONE_PROCESS=Y&SPAUKQ=708501792841963&CONSTITUENCY=WBST&TOKENIDX=1507971558
 
   * Input: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE=Y&CLONE_PROCESS=Y&SPAUKQ=708501792841963&TOKENIDX=1507971558
 
   *
 
   * Result: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE_PROCESS=Y&SPAUKQ=708501792841963&CONSTITUENCY=WBST&TOKENIDX=2281086932
 
   * Result: HTTPS://kvdata.calvin.edu/walive/WebAdvisor?TYPE=P&PID=ST-WESTS13C&CLONE_PROCESS=Y&SPAUKQ=708501792841963&TOKENIDX=2281086932
 
   */
 
  $uri = preg_replace('/([?&])TOKENIDX=[^&]+/', '$1TOKENIDX=' . $token,
 
		      preg_replace('/([?&])(CLONE|FORCEIDX)=[^&]+&?/', '$1', $uri));
 

	
 
  return calvin_crawl_noscript_filter(school_crawl_geturi($uri, $cookies, $school_crawl_log));
 
}
 

	
 
/**
 
 * \brief
 
 *   Add a course to a semester if that semester doesn't yet have this
 
 *   course.
 
 *
 
 * \param $semester
 
 *   The semester to which the course should be appended.
 
 * \param $department
 
 *   The department of the course to add.
 
 * \param $course_id
 
 *   The course_id which, with the department string, forms a
 
 *   fully-qualified course_id.
 
 * \param $title
 
 *   The title given to the new Course object when one is created.
 
 */
 
function calvin_crawl_course_add(Semester $semester, $department, $course_id, $title)
 
{
 
  if ($semester->class_get($department, $course_id) == NULL)
 
    $semester->class_add(new Course($department . '-' . $course_id, $title));
0 comments (0 inline, 0 general)