def text2vec(text):
    """
    text to one-hot vector
    :param text: source text
    :return: np array
    """
    if len(text) > CAPTCHA_LENGTH:
        return False
    vector = np.zeros(CAPTCHA_LENGTH * VOCAB_LENGTH)
    for i, c in enumerate(text):
        index = i * VOCAB_LENGTH + VOCAB.index(c)
        vector[index] = 1
    return vector
def vec2text(vector):
    """
    vector to captcha text
    :param vector: np array
    :return: text
    """
    if not isinstance(vector, np.ndarray):
        vector = np.asarray(vector)
    vector = np.reshape(vector, [CAPTCHA_LENGTH, -1])
    text = ''
    for item in vector:
        text += VOCAB[np.argmax(item)]
    return text
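The two helpers are inverses of each other, so a quick round-trip check confirms the encoding (a sketch assuming, for example, VOCAB = '0123456789' and CAPTCHA_LENGTH = 4):

vector = text2vec('1234')
print(vec2text(vector))  # -> '1234'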
# generate data x and y
for i in range(DATA_LENGTH):
    text = get_random_text()
    # get captcha array
    captcha_array = generate_captcha(text)
    # get vector
    vector = text2vec(text)
    data_x.append(captcha_array)
    data_y.append(vector)
# write data to pickle
if not exists(DATA_PATH):
    makedirs(DATA_PATH)
x = np.asarray(data_x, np.float32)
y = np.asarray(data_y, np.float32)
with open(join(DATA_PATH, 'data.pkl'), 'wb') as f:
    pickle.dump(x, f)
    pickle.dump(y, f)
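Because the two arrays are dumped into the same file one after the other, they have to be loaded back in the same order, for example:

with open(join(DATA_PATH, 'data.pkl'), 'rb') as f:
    x = pickle.load(f)  # first dumped, first loaded
    y = pickle.load(f)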
# input layer
with tf.variable_scope('inputs'):
    # x.shape = [-1, 60, 160, 3]
    x, y_label = iterator.get_next()
keep_prob = tf.placeholder(tf.float32, [])
y = tf.cast(x, tf.float32)
# 3 CNN layers
for _ in range(3):
    y = tf.layers.conv2d(y, filters=32, kernel_size=3,
                         padding='same', activation=tf.nn.relu)
    y = tf.layers.max_pooling2d(y, pool_size=2, strides=2, padding='same')
    # y = tf.layers.dropout(y, rate=keep_prob)
# 2 dense layers
y = tf.layers.flatten(y)
y = tf.layers.dense(y, 1024, activation=tf.nn.relu)
# note: tf.layers.dropout's rate is the fraction to *drop*, so despite its
# name the keep_prob placeholder should be fed the drop probability
y = tf.layers.dropout(y, rate=keep_prob)
# one logit per vocab entry for each of the CAPTCHA_LENGTH characters,
# matching the one-hot vectors produced by text2vec
y = tf.layers.dense(y, CAPTCHA_LENGTH * VOCAB_LENGTH)
NOTE: While the classes are mutually exclusive, their probabilities need not be. All that is required is that each row of labels is a valid probability distribution. If they are not, the computation of the gradient will be incorrect.
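This note comes from the documentation of tf.nn.softmax_cross_entropy_with_logits. Since each of the CAPTCHA_LENGTH characters is an independent classification over the vocabulary, one way to wire up the loss is to reshape both logits and labels to one row per character, so that each row of labels is a valid one-hot distribution. This is only a sketch, and LEARNING_RATE is assumed to be defined elsewhere:

# one row per character; each row of y_label is then a valid distribution
y_reshaped = tf.reshape(y, [-1, VOCAB_LENGTH])
y_label_reshaped = tf.reshape(y_label, [-1, VOCAB_LENGTH])
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_label_reshaped,
                                            logits=y_reshaped))
train_op = tf.train.AdamOptimizer(LEARNING_RATE).minimize(cross_entropy)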
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
upstream splash {
    server 41.159.27.223:8050 weight=4;
    server 41.159.27.221:8050 weight=2;
    server 41.159.27.9:8050 weight=2;
    server 41.159.117.119:8050 weight=1;
}
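An upstream block on its own does nothing until a server block proxies to it; a minimal sketch (the listen port here is an assumption) would be:

server {
    listen 8050;
    location / {
        proxy_pass http://splash;
    }
}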
import requests
from urllib.parse import quote
import re
lua = '''
function main(splash, args)
  local treat = require("treat")
  local response = splash:http_get("http://httpbin.org/get")
  return treat.as_string(response.body)
end
'''
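The script can then be submitted to Splash's execute endpoint, which is what the requests and quote imports above are for (this sketch assumes a Splash instance listening locally on port 8050):

url = 'http://localhost:8050/execute?lua_source=' + quote(lua)
response = requests.get(url)
print(response.text)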
From the HAR results, we can see that Splash executed the entire page rendering process, including loading CSS and JavaScript, and the rendered page is exactly what we would get in a browser.
So what controls this process? Returning to the home page, we can see that there is actually a script there, with the following content:
function main(splash, args)
  assert(splash:go(args.url))
  assert(splash:wait(0.5))
  return {
    html = splash:html(),
    png = splash:png(),
    har = splash:har(),
  }
end
This script is written in Lua. Even without knowing the language's syntax, we can roughly tell from its surface meaning that it first calls the go() method to load the page, then calls the wait() method to wait for a certain amount of time, and finally returns the page source, a screenshot, and the HAR information.
function main(splash, args)
  local example_urls = {"www.baidu.com", "www.taobao.com", "www.zhihu.com"}
  local urls = args.urls or example_urls
  local results = {}
  for index, url in ipairs(urls) do
    local ok, reason = splash:go("http://" .. url)
    if ok then
      splash:wait(2)
      results[url] = splash:png()
    end
  end
  return results
end
This property controls whether images are loaded; they are loaded by default. Disabling it saves network traffic and speeds up page loading. Note, however, that disabling image loading may affect JavaScript rendering: once images are disabled, the height of their enclosing DOM nodes changes, which in turn shifts the position of other DOM nodes. So if some JavaScript operates on image nodes, its execution may be affected.
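Image loading can also be turned off per render through Splash's HTTP render endpoints via the images argument; a minimal sketch assuming a local Splash instance (the target URL is only an example):

import requests

# images=0 disables image loading; wait gives scripts time to run
params = {'url': 'https://www.taobao.com', 'images': 0, 'wait': 5}
response = requests.get('http://localhost:8050/render.png', params=params)
with open('taobao.png', 'wb') as f:
    f.write(response.content)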
function main(splash, args)
  local ok, reason = splash:go{"http://httpbin.org/post", http_method="POST", body="name=Germey"}
  if ok then
    return splash:html()
  end
end
function main(splash, args)
  local get_div_count = splash:jsfunc([[
  function () {
    var body = document.body;
    var divs = body.getElementsByTagName('div');
    return divs.length;
  }
  ]])
  splash:go("https://www.baidu.com")
  return ("There are %s DIVs"):format(get_div_count())
end
function main(splash, args)
  assert(splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js"))
  assert(splash:go("https://www.taobao.com"))
  local version = splash:evaljs("$.fn.jquery")
  return 'JQuery version: ' .. version
end
function main(splash)
  local treat = require('treat')
  assert(splash:go("http://quotes.toscrape.com/"))
  assert(splash:wait(0.5))
  local texts = splash:select_all('.quote .text')
  local results = {}
  for index, text in ipairs(texts) do
    results[index] = text.node.innerHTML
  end
  return treat.as_array(results)
end
Here we selected the text content of the quote nodes with a CSS selector, then iterated over all the matched nodes and extracted the text from each.
The output is as follows:
SplashResponse: Array[10]
0: "“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”"
1: "“It is our choices, Harry, that show what we truly are, far more than our abilities.”"
2: "“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”"
3: "“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”"
4: "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”"
5: "“Try not to become a man of success. Rather become a man of value.”"
6: "“It is better to be hated for what you are than to be loved for what you are not.”"
7: "“I have not failed. I've just found 10,000 ways that won't work.”"
8: "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”"
9: "“A day without sunshine is like, you know, night.”"
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_images(json):
    if json.get('data'):
        for item in json.get('data'):
            title = item.get('title')
            images = item.get('image_detail')
            for image in images:
                yield {
                    'image': image.get('url'),
                    'title': title
                }
def save_image(item):
    if not os.path.exists(item.get('title')):
        os.mkdir(item.get('title'))
    try:
        response = requests.get(item.get('image'))
        if response.status_code == 200:
            file_path = '{0}/{1}.{2}'.format(item.get('title'),
                                             md5(response.content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            else:
                print('Already Downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to Save Image')
Finally, we just need to construct an array of offset values, iterate over it, extract the image links, and download them:
from multiprocessing.pool import Pool
def main(offset):
    json = get_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)
GROUP_START = 1
GROUP_END = 20
if __name__ == '__main__':
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)
    pool.close()
    pool.join()
db = pymysql.connect(host='localhost', user='root',
                     password='123456', port=3306, db='spiders')
cursor = db.cursor()
sql = 'CREATE TABLE IF NOT EXISTS students (id VARCHAR(255) NOT NULL, name VARCHAR(255) NOT NULL, age INT NOT NULL, PRIMARY KEY (id))'
cursor.execute(sql)
db.close()
html = '''
<div class="wrap">
    Hello, World
    <p>This is a paragraph.</p>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
from bs4 import BeautifulSoup
soup = BeautifulSoup('<p>Hello</p>', 'lxml')
print(soup.p.string)
In what follows, the Beautiful Soup usage examples will all be demonstrated with this parser.
4. Basic Usage
Let's first look at the basic usage of Beautiful Soup through an example:
html = """ <html><head><title>The Dormouse's story</title></head> <body> <pclass="title"name="dromouse"><b>The Dormouse's story</b></p> <pclass="story">Once upon a time there were three little sisters; and their names were <ahref="http://example.com/elsie"class="sister"id="link1"><!-- Elsie --></a>, <ahref="http://example.com/lacie"class="sister"id="link2">Lacie</a> and <ahref="http://example.com/tillie"class="sister"id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <pclass="story">...</p> """ from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'lxml') print(soup.prettify()) print(soup.title.string)
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title" name="dromouse">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    <!-- Elsie -->
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
   and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>
The Dormouse's story
html = """ <html><head><title>The Dormouse's story</title></head> <body> <pclass="title"name="dromouse"><b>The Dormouse's story</b></p> <pclass="story">Once upon a time there were three little sisters; and their names were <ahref="http://example.com/elsie"class="sister"id="link1"><!-- Elsie --></a>, <ahref="http://example.com/lacie"class="sister"id="link2">Lacie</a> and <ahref="http://example.com/tillie"class="sister"id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <pclass="story">...</p> """ from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'lxml') print(soup.title) print(type(soup.title)) print(soup.title.string) print(soup.head) print(soup.p)
The output is as follows:
<title>The Dormouse's story</title>
<class 'bs4.element.Tag'>
The Dormouse's story
<head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
html = """ <html> <head> <title>The Dormouse's story</title> </head> <body> <pclass="story"> Once upon a time there were three little sisters; and their names were <ahref="http://example.com/elsie"class="sister"id="link1"> <span>Elsie</span> </a> <ahref="http://example.com/lacie"class="sister"id="link2">Lacie</a> and <ahref="http://example.com/tillie"class="sister"id="link3">Tillie</a> and they lived at the bottom of a well. </p> <pclass="story">...</p> """
The output is as follows:
['\n Once upon a time there were three little sisters; and their names were\n ', <a class="sister" href="http://example.com/elsie" id="link1">
<span>Elsie</span>
</a>, '\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' \n and\n ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, '\n and they lived at the bottom of a well.\n ']
html = """ <html> <head> <title>The Dormouse's story</title> </head> <body> <pclass="story"> Once upon a time there were three little sisters; and their names were <ahref="http://example.com/elsie"class="sister"id="link1"> <span>Elsie</span> </a> </p> <pclass="story">...</p> """ from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'lxml') print(soup.a.parent)
The output is as follows:
<pclass="story"> Once upon a time there were three little sisters; and their names were <aclass="sister"href="http://example.com/elsie"id="link1"> <span>Elsie</span> </a> </p>
html = """ <html> <body> <p class="story"> Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1"> <span>Elsie</span> </a> Hello <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a> and they lived at the bottom of a well. </p> """ from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'lxml') print('Next Sibling', soup.a.next_sibling) print('Prev Sibling', soup.a.previous_sibling) print('Next Siblings', list(enumerate(soup.a.next_siblings))) print('Prev Siblings', list(enumerate(soup.a.previous_siblings)))
The output is as follows:
Next Sibling Hello
Prev Sibling Once upon a time there were three little sisters; and their names were
Next Siblings [(0, '\n Hello\n '), (1, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>), (2, ' \n and\n '), (3, <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>), (4, '\n and they lived at the bottom of a well.\n ')]
Prev Siblings [(0, '\n Once upon a time there were three little sisters; and their names were\n ')]
html = """ <html> <body> <p class="story"> Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Bob</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> </p> """ from bs4 import BeautifulSoup soup = BeautifulSoup(html, 'lxml') print('Next Sibling:') print(type(soup.a.next_sibling)) print(soup.a.next_sibling) print(soup.a.next_sibling.string) print('Parent:') print(type(soup.a.parents)) print(list(soup.a.parents)[0]) print(list(soup.a.parents)[0].attrs['class'])
The output is as follows:
Next Sibling:
<class 'bs4.element.Tag'>
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie
Parent:
<class 'generator'>
<p class="story">
    Once upon a time there were three little sisters; and their names were
    <a class="sister" href="http://example.com/elsie" id="link1">Bob</a><a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
</p>
['story']
for ul in soup.find_all(name='ul'):
    print(ul.find_all(name='li'))
    for li in ul.find_all(name='li'):
        print(li.string)
The output is as follows:
[<li class="element">Foo</li>, <liclass="element">Bar</li>, <liclass="element">Jay</li>] Foo Bar Jay [<liclass="element">Foo</li>, <liclass="element">Bar</li>] Foo Bar
import re
html = '''
<div class="panel">
    <div class="panel-body">
        <a>Hello, this is a link</a>
        <a>Hello, this is a link, too</a>
    </div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(text=re.compile('link')))
The output is as follows:
['Hello, this is a link', 'Hello, this is a link, too']
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('li'):
    print('Get Text:', li.get_text())
    print('String:', li.string)
The output is as follows:
Get Text: Foo
String: Foo
Get Text: Bar
String: Bar
Get Text: Jay
String: Jay
Get Text: Foo
String: Foo
Get Text: Bar
String: Bar
from lxml import etree

html = etree.parse('./test.html', etree.HTMLParser())
result = html.xpath('//*')
print(result)
The output is as follows:
[<Element html at 0x10510d9c8>, <Element body at 0x10510da08>, <Element div at 0x10510da48>, <Element ul at 0x10510da88>, <Element li at 0x10510dac8>, <Element a at 0x10510db48>, <Element li at 0x10510db88>, <Element a at 0x10510dbc8>, <Element li at 0x10510dc08>, <Element a at 0x10510db08>, <Element li at 0x10510dc48>, <Element a at 0x10510dc88>, <Element li at 0x10510dcc8>, <Element a at 0x10510dd08>]
from lxml import etree text = ''' <li class="li li-first"><a href="link.html">first item</a></li> ''' html = etree.HTML(text) result = html.xpath('//li[@class="li"]/a/text()') print(result)
from lxml import etree text = ''' <li class="li li-first" name="item"><a href="link.html">first item</a></li> ''' html = etree.HTML(text) result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()') print(result)
text = '''
<div>
    <ul>
        <li class="item-0"><a href="link1.html"><span>first item</span></a></li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-inactive"><a href="link3.html">third item</a></li>
        <li class="item-1"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a>
    </ul>
</div>
'''
html = etree.HTML(text)
result = html.xpath('//li[1]/ancestor::*')
print(result)
result = html.xpath('//li[1]/ancestor::div')
print(result)
result = html.xpath('//li[1]/attribute::*')
print(result)
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)
result = html.xpath('//li[1]/descendant::span')
print(result)
result = html.xpath('//li[1]/following::*[2]')
print(result)
result = html.xpath('//li[1]/following-sibling::*')
print(result)
The output is as follows:
[<Element html at 0x107941808>, <Element body at 0x1079418c8>, <Element div at 0x107941908>, <Element ul at 0x107941948>]
[<Element div at 0x107941908>]
['item-0']
[<Element a at 0x1079418c8>]
[<Element span at 0x107941948>]
[<Element a at 0x1079418c8>]
[<Element li at 0x107941948>, <Element li at 0x107941988>, <Element li at 0x1079419c8>, <Element li at 0x107941a08>]
def write_to_json(content):
    # open the file with an explicit encoding and write the JSON string
    # directly; calling .encode('utf-8') on it would produce bytes, which
    # cannot be written to a file opened in text mode
    with open('result.txt', 'a', encoding='utf-8') as f:
        print(type(json.dumps(content)))
        f.write(json.dumps(content, ensure_ascii=False))
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    for i in range(10):
        main(offset=i * 10)
        time.sleep(1)
With the same HTML text as above, if we want to get the hyperlink, singer, and song name from every a node, we can replace the search() method with findall(). If there are matches, the result is a list, so we need to iterate over it to get each group of contents. The code is as follows:
results = re.findall('<li.*?href="(.*?)".*?singer="(.*?)">(.*?)</a>', html, re.S)
print(results)
print(type(results))
for result in results:
    print(result)
    print(result[0], result[1], result[2])
Earlier we handled Cookies with urllib, which was rather cumbersome; with requests, getting and setting Cookies takes just one step.
Let's first look at the process of getting Cookies with an example:
import requests
r = requests.get("https://www.baidu.com") print(r.cookies) forkey, value in r.cookies.items(): print(key + '=' + value)
The output is as follows:
<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>, <Cookie __bsi=13533594356813414194_00_14_N_N_2_0303_C02F_N_N_N_0 for .www.baidu.com/>]>
BDORZ=27315
__bsi=13533594356813414194_00_14_N_N_2_0303_C02F_N_N_N_0
cookies = 'q_c1=31653b264a074fc9a57816d1ea93ed8b|1474273938000|1474273938000; d_c0="AGDAs254kAqPTr6NW1U3XTLFzKhMPQ6H_nc=|1474273938"; __utmv=51854390.100-1|2=registration_date=20130902=1^3=entry_date=20130902=1;a_t="2.0AACAfbwdAAAXAAAAso0QWAAAgH28HQAAAGDAs254kAoXAAAAYQJVTQ4FCVgA360us8BAklzLYNEHUd6kmHtRQX5a6hiZxKCynnycerLQ3gIkoJLOCQ==";z_c0=Mi4wQUFDQWZid2RBQUFBWU1DemJuaVFDaGNBQUFCaEFsVk5EZ1VKV0FEZnJTNnp3RUNTWE10ZzBRZFIzcVNZZTFGQmZn|1474887858|64b4d4234a21de774c42c837fe0b672fdb5763b0'
jar = requests.cookies.RequestsCookieJar()
headers = {
    'Host': 'www.zhihu.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
}
for cookie in cookies.split(';'):
    key, value = cookie.split('=', 1)
    jar.set(key, value)
r = requests.get("http://www.zhihu.com", cookies=jar, headers=headers)
print(r.text)
Making a GET request with the get() method is nothing special; what is more convenient is that the other request types can likewise be completed in a single line, as shown below:
r = requests.post('http://httpbin.org/post')
r = requests.put('http://httpbin.org/put')
r = requests.delete('http://httpbin.org/delete')
r = requests.head('http://httpbin.org/get')
r = requests.options('http://httpbin.org/get')
r = requests.get("https://www.zhihu.com/explore")
print(r.text)
The output is as follows:
<html><body><h1>500 Server Error</h1>
An internal server error occured.
</body></html>
But if we add headers with the User-Agent field, it works fine:
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
r = requests.get("https://www.zhihu.com/explore", headers=headers)
print(r.text)
Of course, we can add any other field information to the headers parameter as well.
4. POST Requests
We have covered the most basic GET requests; another common request type is POST. Implementing a POST request with requests is just as simple, as shown below:
import requests
data = {'name': 'germey', 'age': '22'}
r = requests.post("http://httpbin.org/post", data=data)
print(r.text)
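httpbin.org/post simply echoes the request back, so the submitted data appears under the form field of the JSON response, roughly like this (other fields omitted):

"form": {
    "age": "22",
    "name": "germey"
}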
While browsing websites, we often run into pages that require logging in: some pages can only be visited after login, and once logged in we can visit the site many times in a row, yet sometimes we need to log in again after a while. There are also sites that log us in automatically as soon as we open the browser, and the login does not expire for a long time. Why is that? This involves the concepts of sessions and Cookies, and this section will demystify them.
1. Static and Dynamic Web Pages
Before we start, we need to understand the concepts of static and dynamic web pages. Here is the sample code from before:
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>This is a Demo</title>
</head>
<body>
    <div id="container">
        <div class="wrapper">
            <h2 class="title">Hello World</h2>
            <p class="text">Hello, this is a paragraph.</p>
        </div>
    </div>
</body>
</html>
This is the most basic HTML code. If we save it as a .html file and put it on a host with a fixed public IP address, with a web server such as Apache or Nginx installed, that host can serve as a web server: anyone can view the page by visiting the server, and we have built the simplest possible website.
The content of such a page is written in HTML; the text, images, and other content are all specified by the pre-written HTML code. This kind of page is called a static web page. It loads quickly and is easy to write, but it has major drawbacks, such as poor maintainability and the inability to display content flexibly based on the URL. For example, if we wanted to pass a name parameter in this page's URL and have it displayed on the page, a static page could not do it.
In the Web context, a session object is used to store the attributes and configuration needed for a particular user's session. That way, as the user jumps between the application's pages, the variables stored in the session object are not lost but persist throughout the user's session. When a user requests a page from the application and does not yet have a session, the web server automatically creates a session object. When a session expires or is abandoned, the server terminates it.
(2) Cookies
Cookies are data that certain websites store on the user's local device in order to identify the user and track the session.
Session Maintenance
So how do we use Cookies to maintain state? When the client requests the server for the first time, the server returns a response whose headers contain a Set-Cookie field, marking which user this is, and the client browser saves the Cookies. The next time the browser requests that site, it submits the Cookies along with the request headers. The Cookies carry the session ID, so the server can look up the corresponding session and then determine the user's state from it.
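requests mirrors this mechanism with its Session object, which stores the Cookies set by one response and sends them back on subsequent requests. A minimal sketch using httpbin's cookie endpoints:

import requests

s = requests.Session()
# the server sets a cookie via Set-Cookie; the Session object keeps it
s.get('http://httpbin.org/cookies/set/number/123456789')
# the stored cookie is sent back automatically on the next request
r = s.get('http://httpbin.org/cookies')
print(r.text)  # the cookie "number" appears in the response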
When we successfully log in to a website, the server tells the client which Cookies to set; on subsequent page visits, the client sends the Cookies to the server, and the server finds the corresponding session and checks it. If the session variables that mark the login state are valid, that proves the user is logged in, and the server returns the page content that is only visible after login, which the browser then parses and displays.
Conversely, if the Cookies sent to the server are invalid, or the session has expired, we cannot continue to visit the page, and we may get an error response or be redirected to the login page to log in again.
However, when we close the browser, it does not notify the server before closing, so the server has no way of knowing the browser has closed. The reason for this misconception is that most session mechanisms use a session cookie to hold the session ID, and such Cookies disappear when the browser is closed; on reconnecting to the server, the original session cannot be found. If the server's Cookies were saved to disk, or if some means were used to rewrite the HTTP request headers sent by the browser so that the original Cookies were still sent to the server, then the original session ID could be found after reopening the browser, and the login state would still be maintained.