码墨 2020-02-14
对前几篇文章的代码进行梳理,形成4个通用型函数(Get、GetWitdHeader、Post、PostWitdHeader),按是否携带headers可分为两类:
1 直接Get或Post,通常会被网站限制访问;
2 带headers进行Get或Post,模拟了浏览器,通常可以正常访问。
代码(注意:由于下面的代码在设置http header时包含*/*,页面上代码的显示不太正常,但不影响代码的实际运行):
//Header是直接从chrome console中复制的view source形式的Request Headers,注意只包括以冒号分割的内容。
//FormData也是直接从chrome console中复制的view source形式的Form Data
//1:Get方式:ms.Get() 需要先设置ms.Url
//2: GetWitdHeader方式:ms.GetWitdHeader() 需要先设置ms.Url, Header
//3: Post方式:ms.Post() 需要先设置ms.Url, FormData
//4: PostWitdHeader方式:ms.PostWitdHeader() 需要先设置ms.Url, Header, FormData
//如遇GBK乱码,请参考https://www.cnblogs.com/pu369/p/12228659.html
package main
import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"net/http/cookiejar"
	"strings"

	"golang.org/x/text/encoding/simplifiedchinese"
)
// MySpider bundles the inputs for one HTTP request together with the client
// used to send it. Callers fill in the string fields and then invoke one of
// the request methods.
type MySpider struct {
	// Url is the address the next request is sent to.
	Url string
	// Header holds raw request headers, one "Name: value" pair per line.
	Header string
	// FormData is the request body sent by the POST methods.
	FormData string
	// Client is the HTTP client every request goes through.
	Client *http.Client
}
// main is a demo that drives one MySpider through a fixed sequence of requests
// against an intranet site: a plain GET, a GET with browser headers, a plain
// POST of the login form, a POST with headers, and finally a plain GET of a
// page behind the login. Each response body is printed to stdout. The same
// spider is reused throughout, so the statement order matters.
func main() {
// Create the spider instance.
ms := NewMySpider()
// Visit the home page.
ms.Url = "http://192.168.132.80/wui/theme/ecology7/page/login.jsp"
fmt.Println(ms.Get())
// Author's observation: the GET above, sent without any headers, gets back a page
// containing an XSS string, while a browser loads the page normally, so headers are needed.
ms.Url = "http://192.168.132.80/wui/theme/ecology7/page/login.jsp"
ms.Header = `Host: 192.168.132.80
Connection: keep-alive
Pragma: no-cache
Cache-Control: no-cache
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cookie: JSESSIONID=abcIswHnk9uU49ql9MP2w; testBanCookie=test; loginfileweaver=%2Fwui%2Ftheme%2Fecology7%2Fpage%2Flogin.jsp%3FtemplateId%3D6%26logintype%3D1%26gopage%3D; languageidweaver=7; loginidweaver=114`
fmt.Println(ms.GetWitdHeader())
// Author's observation: with the headers set, the GET above returns the same page source a browser sees.
// Visit the login page.
// The site's login form targets http://192.168.132.80/login/RemindLogin.jsp?RedirectFile=/portal/plugin/homepage/ecology7theme/index.jsp?templateId=3, which answers with a 302 redirect to
// http://192.168.132.80/login/RemindLogin.jsp?RedirectFile=/portal/plugin/homepage/ecology7theme/index.jsp?templateId=3 (both URLs are identical in the original note); POSTing to either one has the same effect.
ms.Url = "http://192.168.132.80/login/RemindLogin.jsp?RedirectFile=/portal/plugin/homepage/ecology7theme/index.jsp?templateId=3"
ms.FormData = `loginfile=%2Fwui%2Ftheme%2Fecology7%2Fpage%2Flogin.jsp%3FtemplateId%3D6%26logintype%3D1%26gopage%3D&logintype=1&fontName=%CE%A2%EF%BF%BD%EF%BF%BD%EF%BF%BD%C5%BA%EF%BF%BD&message=16&gopage=&formmethod=post&rnd=&serial=&username=&isie=true&loginid=admin&userpassword=1234&submit=`
fmt.Println(ms.Post())
// Author's observation: the POST above, sent without headers, also gets back a page containing an XSS string, so headers are required here too.
// Url, Header and FormData are all set already, so PostWitdHeader can be called directly; the author reports that this logs in successfully.
fmt.Println(ms.PostWitdHeader())
// Now fetch a page that is only available after logging in; the author reports that no headers are needed at this point.
ms.Url = "http://192.168.132.80/CRM/data/CustomerBrowser.jsp?splitflag="
fmt.Println(ms.Get())
}
// Get sends a plain GET request to s.Url through s.Client and returns the
// response body, decoded from GB18030 to UTF-8.
//
// s.Url must be set before calling. If the request cannot be sent, the error
// is logged and "" is returned. If reading the body fails part-way, the error
// is logged and whatever was read so far is returned.
func (s MySpider) Get() string {
	resp, err := s.Client.Get(s.Url)
	if err != nil {
		// resp is nil on failure, so it must not be touched here.
		log.Printf("MySpider.Get: %v", err)
		return ""
	}
	defer resp.Body.Close()

	reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
	body, err := ioutil.ReadAll(reader)
	if err != nil {
		log.Printf("MySpider.Get: reading body of %s: %v", s.Url, err)
	}
	return string(body)
}
// GetWitdHeader sends a GET request to s.Url with the headers listed in
// s.Header and returns the response body, decoded from GB18030 to UTF-8.
//
// s.Header is expected to hold one "Name: value" pair per line, as copied from
// a browser's request-header view. Lines that are empty, contain no colon, or
// begin with a colon are skipped. Names and values are trimmed of surrounding
// whitespace, including a trailing "\r".
//
// If the request cannot be built or sent, the error is logged and "" is
// returned. If reading the body fails part-way, the error is logged and
// whatever was read so far is returned.
func (s MySpider) GetWitdHeader() string {
	req, err := http.NewRequest("GET", s.Url, nil)
	if err != nil {
		log.Printf("MySpider.GetWitdHeader: building request: %v", err)
		return ""
	}

	for _, line := range strings.Split(s.Header, "\n") {
		sep := strings.Index(line, ":")
		if sep <= 0 {
			// No header name on this line; slicing with sep == -1 would panic.
			continue
		}
		name := strings.TrimSpace(line[:sep])
		value := strings.TrimSpace(line[sep+1:])
		if name == "" {
			continue
		}
		if strings.EqualFold(name, "Host") {
			// net/http ignores a "Host" entry in req.Header on client
			// requests; the value has to be placed in req.Host instead.
			req.Host = value
			continue
		}
		req.Header.Set(name, value)
	}

	resp, err := s.Client.Do(req)
	if err != nil {
		// resp is nil on failure, so it must not be touched here.
		log.Printf("MySpider.GetWitdHeader: %v", err)
		return ""
	}
	defer resp.Body.Close()

	reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
	body, err := ioutil.ReadAll(reader)
	if err != nil {
		log.Printf("MySpider.GetWitdHeader: reading body of %s: %v", s.Url, err)
	}
	return string(body)
}
// Post sends s.FormData as an application/x-www-form-urlencoded POST body to
// s.Url through s.Client and returns the response body, decoded from GB18030
// to UTF-8.
//
// s.Url and s.FormData must be set before calling. If the request cannot be
// sent, the error is logged and "" is returned. If reading the body fails
// part-way, the error is logged and whatever was read so far is returned.
func (s MySpider) Post() string {
	resp, err := s.Client.Post(s.Url, "application/x-www-form-urlencoded", strings.NewReader(s.FormData))
	if err != nil {
		// resp is nil on failure, so it must not be touched here.
		log.Printf("MySpider.Post: %v", err)
		return ""
	}
	defer resp.Body.Close()

	reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
	body, err := ioutil.ReadAll(reader)
	if err != nil {
		log.Printf("MySpider.Post: reading body of %s: %v", s.Url, err)
	}
	return string(body)
}
// PostWitdHeader sends s.FormData as the body of a POST request to s.Url with
// the headers listed in s.Header, and returns the response body decoded from
// GB18030 to UTF-8.
//
// s.Header is expected to hold one "Name: value" pair per line, as copied from
// a browser's request-header view. Lines that are empty, contain no colon, or
// begin with a colon are skipped. Names and values are trimmed of surrounding
// whitespace, including a trailing "\r". When s.Header supplies no
// Content-Type, application/x-www-form-urlencoded is used, matching Post.
//
// If the request cannot be built or sent, the error is logged and "" is
// returned. If reading the body fails part-way, the error is logged and
// whatever was read so far is returned.
func (s MySpider) PostWitdHeader() string {
	req, err := http.NewRequest("POST", s.Url, strings.NewReader(s.FormData))
	if err != nil {
		log.Printf("MySpider.PostWitdHeader: building request: %v", err)
		return ""
	}

	for _, line := range strings.Split(s.Header, "\n") {
		sep := strings.Index(line, ":")
		if sep <= 0 {
			// No header name on this line; slicing with sep == -1 would panic.
			continue
		}
		name := strings.TrimSpace(line[:sep])
		value := strings.TrimSpace(line[sep+1:])
		if name == "" {
			continue
		}
		if strings.EqualFold(name, "Host") {
			// net/http ignores a "Host" entry in req.Header on client
			// requests; the value has to be placed in req.Host instead.
			req.Host = value
			continue
		}
		req.Header.Set(name, value)
	}
	// Headers copied from a GET page carry no Content-Type; default to the
	// form encoding that Post uses for the same body.
	if req.Header.Get("Content-Type") == "" {
		req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	}

	resp, err := s.Client.Do(req)
	if err != nil {
		// resp is nil on failure, so it must not be touched here.
		log.Printf("MySpider.PostWitdHeader: %v", err)
		return ""
	}
	defer resp.Body.Close()

	reader := simplifiedchinese.GB18030.NewDecoder().Reader(resp.Body)
	body, err := ioutil.ReadAll(reader)
	if err != nil {
		log.Printf("MySpider.PostWitdHeader: reading body of %s: %v", s.Url, err)
	}
	return string(body)
}
// NewMySpider returns a MySpider whose HTTP client is backed by a fresh cookie
// jar. Url, Header and FormData are left empty for the caller to fill in.
// It panics if the cookie jar cannot be created.
func NewMySpider() *MySpider {
	cookies, err := cookiejar.New(nil)
	if err != nil {
		panic(err)
	}
	spider := new(MySpider)
	spider.Client = &http.Client{Jar: cookies}
	return spider
}