Golang 爬虫爬取贴吧漫画

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"regexp"
	"runtime"
	"strconv"
	"sync"
)

// Regular-expression matching rules.
var (
	// contentRegexp matches an element carrying class="BDE_Image" followed by
	// a pic_type attribute, and captures the value of the src attribute that
	// comes after it (capture group 1).
	contentRegexp = regexp.MustCompile(`class="BDE_Image" pic_type=".*?src="(.*?)"`)
)

// GetContent fetches url with an HTTP GET and returns the response body as a
// string together with a status code.
//
// statusCode is 404 when the request itself fails and 503 when the body
// cannot be read (content is empty in both cases); otherwise it is the status
// code reported by the server.
func GetContent(url string) (content string, statusCode int) {
	resp, err := http.Get(url)
	if err != nil {
		return "", 404
	}
	// Deferred only after the error check: resp is nil when http.Get fails,
	// so touching resp.Body any earlier would panic.
	defer resp.Body.Close()

	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", 503
	}

	return string(data), resp.StatusCode
}

// FindURLList extracts the comic image URLs from an HTML page.
// It collects capture group 1 of every contentRegexp match found in content,
// in order of appearance; the result is nil when nothing matches.
func FindURLList(content string) (urlList []string) {
	matches := contentRegexp.FindAllStringSubmatch(content, -1)
	for i := range matches {
		urlList = append(urlList, matches[i][1])
	}
	return urlList
}

// DownloadPicture downloads the image at item and saves it in the current
// working directory as "<episode>-<index>.png".
//
// wg.Done is called exactly once before the function returns, whatever the
// outcome. A failed request, an HTTP status >= 400, a body read error or a
// file write error is reported on stdout.
func DownloadPicture(wg *sync.WaitGroup, index int, item string, episode string) {
	defer wg.Done()

	fileName := episode + "-" + strconv.Itoa(index) + ".png"
	reportFailure := func() {
		fmt.Println("The " + fileName + " download failed")
	}

	resp, err := http.Get(item)
	if err != nil {
		reportFailure()
		return
	}
	// Close the body on every path, including read errors, so the
	// underlying connection is not leaked.
	defer resp.Body.Close()

	// An error response is not image data; do not save it as a picture.
	if resp.StatusCode >= 400 {
		reportFailure()
		return
	}

	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		reportFailure()
		return
	}

	if err := ioutil.WriteFile(fileName, data, 0644); err != nil {
		reportFailure()
	}
}

// main prompts for a page URL and an episode number, creates (or reuses) the
// directory "conan-<episode>", switches into it, fetches the page and
// downloads every image URL found on it, one goroutine per image.
func main() {
	// GOMAXPROCS already defaults to the CPU count on Go 1.5 and later, so
	// this call only matters on older toolchains.
	runtime.GOMAXPROCS(runtime.NumCPU())

	fmt.Println("Please input your URL: ")
	var url string
	fmt.Scanln(&url)

	fmt.Println("Please input your episode number: ")
	var episode string
	fmt.Scanln(&episode)

	dir := "./" + "conan-" + episode
	// An already existing directory is fine; any other failure is fatal.
	if err := os.Mkdir(dir, 0755); err != nil && !os.IsExist(err) {
		fmt.Println("Failed to create directory " + dir + ": " + err.Error())
		return
	}
	// Without a successful Chdir the pictures would land in the wrong place.
	if err := os.Chdir(dir); err != nil {
		fmt.Println("Failed to enter directory " + dir + ": " + err.Error())
		return
	}

	htmlContent, statusCode := GetContent(url)
	if statusCode >= 400 {
		fmt.Println("Failed url.")
		return
	}

	urlList := FindURLList(htmlContent)

	// Each DownloadPicture call signals wg exactly once when it finishes.
	wg := sync.WaitGroup{}
	wg.Add(len(urlList))

	fmt.Println("Picture is downloading, wait a minute..")
	for index, item := range urlList {
		go DownloadPicture(&wg, index, item, episode)
	}
	wg.Wait()
}

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注

开始在上面输入您的搜索词,然后按回车进行搜索。按ESC取消。

返回顶部