本文的程序是使用go语言编写的网络爬虫程序,目标是从某大型教育网站上批量抓取国内所有大学的信息,3000多个大学的信息可并行抓取,可控制并行度及间隔时间。 由于我们主要是研究编程方法,因此为了避免对此教育网站造成压力,代码中将域名更改为test.com

主要过程如下

执行主函数main

按照学校的编号循环抓取每个学校的网页内容

  • 获得学校名称,例如:北京大学
  • 获得学校所在地,例如:北京
  • 获得院校类型,例如:综合类
  • 获得学历层次,例如:普通本科、211、985等
  • 获得招生电话
  • 获得通讯地址
  • 获得官网地址
  • 获得学校简介
  • 获得专业列表
  • 获得专业介绍

本代码使用了GO语言的主要功能

  • os,建立data目录等
  • time,实现延时
  • io,实现文件的写入
  • fmt,实现格式输出
  • net/http,实现网页内容的下载
  • regexp,实现正则表达式对网页内容的抓取
  • 使用了go语句,进行函数并发执行

代码分享

代码中有详细的注释,代码如下:

/*
名称:网络爬虫程序
用途:从某大型教育网站抓取大学的信息
作者:腾达格尔
版本:Ver 1.00 Go语言版本:1.3
时间:2014-09-01
*/
package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "os"
    "regexp"
    "time"
)

var (
    tag_BrTag = regexp.MustCompile(`<br>|(</br>)|(<br/>)`)
    tag_PTag  = regexp.MustCompile(`<p>|(</p>)|(&rdquo)|(&ldquo)|(&mdash)|(</strong>)`)

    tag_Nbsp = regexp.MustCompile(`&nbsp;`)

    tag_HTMLTag     = regexp.MustCompile(`(?s)</?.*?>`)
    tag_Space       = regexp.MustCompile(`(^\s+)|( )|(\r\n)|(\t)`)
    tag_SchoolName  = regexp.MustCompile(`\svar\sschoolname='(.+)';\s`)
    tag_SchoolJjUrl = regexp.MustCompile(`<a href="(.+)">学校简介</a>`)
    tag_SchoolJj    = regexp.MustCompile(`<divclass="txt_leftline_24">(.+)</p><scripttype="text/javascript">`)

    //匹配专业列表
    tag_SpecialtyItem = regexp.MustCompile(`<a href="/schoolhtm/specialty/(.+\.htm)" >(.+)</a>`)

    //读取专业内容
    tag_SpecialtyContent = regexp.MustCompile(`<divclass="txt_leftline_24"><p><strong>(.+)</p><scripttype="text/javascript">`)

    //学历层次
    tag_XueLiCengCi = regexp.MustCompile(`<p>学历层次:(.+)<p>招生电话:`)

    //学历层次项目
    tag_XueLiCengCiItem = regexp.MustCompile(`\[(.*?)\]`)

    //招生电话
    tag_Zsdh = regexp.MustCompile(`<p>招生电话:(.+)</p><p>通讯地址:`)

    //通信地址
    tag_Txdz = regexp.MustCompile(`<p>通讯地址:(.+)</p><p>电子邮箱:`)

    //官网地址
    tag_Gwdz = regexp.MustCompile(`<p>官网地址:<ahref="(.+)"target="_blank"class="blue_12">http:`)

    //所在地
    tag_Szd = regexp.MustCompile(`<p>所在地:(.+)院校类型:`)
    //院校类型
    tag_Yxlx = regexp.MustCompile(`院校类型:(.+)</p><p>学历层次:`)
)

//根据网址获得网页内容
func Get(url string) (content string, statusCode int) {
    resp, err1 := http.Get(url)
    if err1 != nil {
        statusCode = -100
        return
    }
    defer resp.Body.Close()
    data, err2 := ioutil.ReadAll(resp.Body)
    if err2 != nil {
        statusCode = -200
        return
    }
    statusCode = resp.StatusCode
    content = string(data)
    return
}

//获得学校首页内容
func GetSchoolHome(code string) (school_home string, ret int) {
    school_url := "http://gkcx.test.com/schoolhtm/schoolTemple/school" + code + ".htm"
    s, statusCode := Get(school_url)
    if statusCode != 200 {
        school_home = ""
        ret = 1
        return
    }
    school_home = s
    ret = 0
    return
}

//获取学校名称
func GetSchoolName(school_info string) (school_name string, xlcc_merge string, zsdh string, txdz string, gwdz string, szd string, yxlx string, ret int) {

    match := tag_SchoolName.FindStringSubmatch(school_info)
    if match != nil {
        school_name = match[1]
        ret = 0
    } else {
        school_name = ""
        ret = 1
    }

    school_info_temp := school_info

    school_info_temp = tag_Space.ReplaceAllString(school_info_temp, "")
    //fmt.Println(school_info_temp)

    //学历层次
    xlcc := ""
    match1 := tag_XueLiCengCi.FindStringSubmatch(school_info_temp)
    if match1 != nil {
        xlcc = match1[1]

    } else {
        xlcc = ""

    }

    matches := tag_XueLiCengCiItem.FindAllStringSubmatch(xlcc, 10000)
    for _, item := range matches {
        xlcc_merge += item[1] + ","

    }
    //fmt.Println("学历层次:",xlcc_merge)

    //招生电话

    matches = tag_Zsdh.FindAllStringSubmatch(school_info_temp, 10000)
    for _, item := range matches {
        zsdh = item[1]

    }
    //fmt.Println("招生电话:",zsdh)

    //所在地
    matches = tag_Szd.FindAllStringSubmatch(school_info_temp, 10000)
    for _, item := range matches {
        szd = item[1]
        os.MkdirAll("data/"+szd, 0666)
    }

    //院校类型
    matches = tag_Yxlx.FindAllStringSubmatch(school_info_temp, 10000)
    for _, item := range matches {
        yxlx = item[1]

    }

    //通信地址

    matches = tag_Txdz.FindAllStringSubmatch(school_info_temp, 10000)
    for _, item := range matches {
        txdz = item[1]

    }
    //fmt.Println("通信地址:",txdz)

    //官网地址

    matches = tag_Gwdz.FindAllStringSubmatch(school_info_temp, 10000)
    for _, item := range matches {
        gwdz = item[1]

    }
    //fmt.Println("官网地址:",gwdz)

    return
}

//获得学校简介
func GetSchoolJj(school_home string) (school_jj string, ret int) {

    match := tag_SchoolJjUrl.FindStringSubmatch(school_home)
    if match != nil {
        school_jj = match[1]
        ret = 0
    } else {
        school_jj = ""
        ret = 1
    }
    school_jj = "http://gkcx.test.com" + school_jj
    //fmt.Println(school_jj)
    s, statusCode := Get(school_jj)
    if statusCode != 200 {
        school_jj = ""
        ret = 1
        return
    }
    school_jj = s
    school_jj = tag_Space.ReplaceAllString(school_jj, "")

    //fmt.Println(school_jj)
    match_jj := tag_SchoolJj.FindStringSubmatch(school_jj)
    if match_jj != nil {
        school_jj = match_jj[1]
        school_jj = tag_PTag.ReplaceAllString(school_jj, "")
        school_jj = tag_BrTag.ReplaceAllString(school_jj, "\r\n")

        ret = 0
    } else {
        school_jj = ""
        ret = 1
    }

    return
}

//专业列表结构体
type SpecialtyItem struct {
    url   string
    title string
}

//获得专业列表
func findSpecialty(code string) (specialty []SpecialtyItem, err error) {

    school_url := "http://gkcx.test.com/schoolhtm/specialty/specialtyList/specialty" + code + ".htm"
    content, statusCode := Get(school_url)
    if statusCode != 200 {
        return
    }

    matches := tag_SpecialtyItem.FindAllStringSubmatch(content, 10000)
    specialty = make([]SpecialtyItem, len(matches))
    for i, item := range matches {
        specialty[i] = SpecialtyItem{"http://gkcx.test.com/schoolhtm/specialty/" + item[1], item[2]}

    }
    return
}

//获得专业内容
func readSpecialty(url string) (content string) {

    content, statusCode := Get(url)
    if statusCode != 200 {
        content = ""
        return
    }
    content = tag_Space.ReplaceAllString(content, "")
    //fmt.Println(content)
    match_jj := tag_SpecialtyContent.FindStringSubmatch(content)
    if match_jj != nil {
        content = match_jj[1]
        content = tag_PTag.ReplaceAllString(content, "")
        content = tag_BrTag.ReplaceAllString(content, "\r\n")
        content = tag_Nbsp.ReplaceAllString(content, " ")

    } else {
        content = ""

    }
    return
}

func GetSchoolInfo(code string) {
    school_home, ret1 := GetSchoolHome(code)
    if ret1 != 0 {
        fmt.Println("Error to get school Home Infomation!", code)
        return
    }

    //获得学校名称
    school_name, xlcc, zsdh, txdz, gwdz, szd, yxlx, ret2 := GetSchoolName(school_home)
    if ret2 != 0 {
        fmt.Println("Error to get school Infomation!", code)
        return
    }

    fmt.Println(szd, school_name, code)
    fileName := fmt.Sprintf("data/%s/列表_%s.txt", szd, school_name)

    //获得学校简介
    school_jj, ret3 := GetSchoolJj(school_home)
    if ret3 != 0 {
        fmt.Println(`Error to GetSchoolJj!`)
        return
    }
    //fmt.Println(school_name,school_jj)

    cnt := fmt.Sprintf("学校名称:%s\n所在地:%s\n院校类型:%s\n学历层次:%s\n招生电话:%s\n通讯地址:%s\n官网地址:%s\n学校简介:\n%s", school_name, szd, yxlx, xlcc, zsdh, txdz, gwdz, school_jj)
    ioutil.WriteFile(fileName, []byte(cnt), 0644)

    //获得专业列表
    specialty, _ := findSpecialty(code)
    for _, item := range specialty {
        //fmt.Printf("获得专业 %s 的内容,来自 %s\n", item.title, item.url)
        specialtyContent := readSpecialty(item.url)
        //fmt.Println(item.title,"====\r\n",specialtyContent)
        fileName := fmt.Sprintf("data/%s/列表_%s_专业_%s.txt", szd, school_name, item.title)
        cnt := fmt.Sprintf("%s 专业: %s \n\n%s", school_name, item.title, specialtyContent)
        ioutil.WriteFile(fileName, []byte(cnt), 0644)
    }
}

//主程序
func main() {
    //根据学校编号遍历所有大学,有些编号可能会不存在
    for i := 30; i < 4000; i++ {
        code := fmt.Sprintf("%d", i)
        if i > 30 && i%50 == 0 {
            fmt.Println("----------------------------")
            time.Sleep(10 * 1e9) //为了减轻网站的负担,每隔50个网站睡眠10秒

        }
        go GetSchoolInfo(code) //并发获得学校的信息
    }

}

附录

关于GO语言

Go语言是谷歌2009发布的第二款开源编程语言。 Go语言专门针对多处理器系统应用程序的编程进行了优化,使用Go编译的程序可以媲美C或C++代码的速度,而且更加安全、支持并行进程。

GO语言官网

http://www.golang.org ,让人无法理解的是,天 朝 居然屏蔽了这个技术网站,我是一直上不去。



blog comments powered by Disqus

Published

2014-09-01

Categories


Tags