进程丶数据共享丶锁丶进程池丶模块(爬虫)

一丶进程

  1.什么是进程

    进程是计算机中的程序关于某数据集合上的一次运行活动,是系统进行进行资源分配和调度的基本单位,是操作系统结构的基础.在早期面向进程设计的计算机结构中,进程是程序的基本执行实体;在当代面向线程设计的计算机结构中,进程是线程的容器.程序时指令丶数据及其组织形式的描述,进程是程序的实体.

  狭义定义:进程是正在运行的程序的实例.

  广义定义:进程是一个具有一定独立功能的程序关于某个数据集合的一次运行活动 .它是操作系统动态执行的基本单元,在传统的操作系统中,进程既是基本的分配单元,也是基本的执行单元.

  2.进程和线程的区别:

    1.进程是CPU资源分配的最小单元........线程是CPU计算的最小单元

    2.一个进程中可以有多个线程

    3.对于Python来说它的进程和线程和其他语言有差异,是有GIL锁,GIL锁保证一个进程中同一时刻只有一个线程被CPU调度

  3.通过继承方式创建进程

import multiprocessing
class MyProcess(multiprocessing.Process):

    def run(self):
        print('当前进程',multiprocessing.current_process())

def run():
    p1 = MyProcess()
    p1.start()

    p2 = MyProcess()
    p2.start()

if __name__ == '__main__':
    run()
# 当前进程 <MyProcess(MyProcess-1, started)>
# 当前进程 <MyProcess(MyProcess-2, started)>
View Code

  4.使用process模块创建进程

import time
from multiprocessing import Process

def f(name):
    print('hello', name)
    print('我是子进程')

if __name__ == '__main__':
    p = Process(target=f, args=('bob',))
    p.start()
    time.sleep(1)
    print('执行主进程的内容了')
View Code
import time
from multiprocessing import Process

def f(name):
    print('hello', name)
    time.sleep(1)
    print('我是子进程')


if __name__ == '__main__':
    p = Process(target=f, args=('bob',))
    p.start()
    #p.join()
    print('我是父进程')
join方法
import os
from multiprocessing import Process

def f(x):
    print('子进程id :',os.getpid(),'父进程id :',os.getppid())
    return x*x

if __name__ == '__main__':
    print('主进程id :', os.getpid())
    p_lst = []
    for i in range(5):
        p = Process(target=f, args=(i,))
        p.start()
查看进程号

  5.进程的常用功能

import time
def task(arg):
    time.sleep(2)
    print(arg)


def run():
    print('111111111')
    p1 = multiprocessing.Process(target=task,args=(1,))
    p = p1.name = 'pp1'
    print(p)
    p1.start()
    print('222222222')

    p2 = multiprocessing.Process(target=task, args=(2,))
    p2.name = 'pp2'
    p2.start()
    print('333333333')

if __name__ == '__main__':
    run()
# 111111111
# pp1
# 222222222
# 333333333
# 2
# 1
View Code

二丶数据共享

  1.进程间的数据不共享

import multiprocessing
data_list = []
def task(arg):
    data_list.append(arg)
    print(data_list)
def run():
    for i in range(10):
        p = multiprocessing.Process(target=task,args=(i,))
        p.start()

if __name__ == '__main__':
    run()
View Code

   2.进程间的数据共享multiprocessing.Queue

import multiprocessing
q = multiprocessing.Queue()

def task(arg,q):
    q.put(arg)

def run():
    for i in range(10):
        p = multiprocessing.Process(target=task, args=(i, q,))
        p.start()

    while True:
        v = q.get()
        print(v)

run()
linux
import multiprocessing
def task(arg,q):
    q.put(arg)

if __name__ == '__main__':
    q = multiprocessing.Queue()
    for i in range(10):
        p = multiprocessing.Process(target=task,args=(i,q,))
        p.start()
    while True:
        v = q.get()
        print(v)
Windows

  3.进程间的数据共享Manager

import multiprocessing
m = multiprocessing.Manager()
dic = m.dict()

def task(arg):
    dic[arg] = 100

def run():
    for i in range(10):
        p = multiprocessing.Process(target=task, args=(i,))
        p.start()

    input('>>>')
    print(dic.values())

if __name__ == '__main__':

    run()
linux
import multiprocessing
import time
def task(arg,dic):
    time.sleep(2)
    dic[arg] = 100

if __name__ == '__main__':
    m = multiprocessing.Manager()
    dic = m.dict()

    process_list = []
    for i in range(10):
        p = multiprocessing.Process(target=task, args=(i,dic,))
        p.start()

        process_list.append(p)

    while True:
        count = 0
        for p in process_list:
            if not p.is_alive():
                count += 1
        if count == len(process_list):
            break
    print(dic)
#{1: 100, 0: 100, 2: 100, 3: 100, 4: 100, 5: 100, 6: 100, 7: 100, 8: 100, 9: 100}
Windows

三丶进程锁

  线程既然有线程锁,进程肯定也有进程锁,两种锁完全一样

import time
import multiprocessing

lock = multiprocessing.RLock()

def task(arg):
    print('鬼子来了')
    lock.acquire()
    time.sleep(4)
    print(arg)
    lock.release()

if __name__ == '__main__':
    p1 = multiprocessing.Process(target=task,args=(1,))
    p1.start()

    p2 = multiprocessing.Process(target=task, args=(2,))
    p2.start()

四丶进程池

from concurrent.futures import ProcessPoolExecutor,ThreadPoolExecutor
import os,time
def task(n):
    time.sleep(1)
    print('[%s] is running'%os.getpid())
if __name__ == '__main__':
    p = ProcessPoolExecutor(3)
    for i in range(10):
        obj = p.submit(task,i).result()
    p.shutdown()  #相当于close和join方法

五丶初识爬虫

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor


# 模拟浏览器发送请求
# 内部创建 sk = socket.socket()
# 和抽屉进行socket连接 sk.connect(...)
# sk.sendall('...')
# sk.recv(...)

def task(url):
    print(url)
    r1 = requests.get(
        url=url,
        headers={
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'
        }
    )

    # 查看下载下来的文本信息
    soup = BeautifulSoup(r1.text,'html.parser')
    print(soup.text)

    content_list = soup.find('div',attrs={'id':'content-list'})
    for item in content_list.find_all('div',attrs={'class':'item'}):
        title = item.find('a').text.strip()
        target_url = item.find('a').get('href')
        print(title,target_url)

def run():
    pool = ThreadPoolExecutor(5)
    for i in range(1,50):
        pool.submit(task,'https://dig.chouti.com/all/hot/recent/%s' %i)

if __name__ == '__main__':
    run()
爬取抽屉的信息
原文地址:https://www.cnblogs.com/qicun/p/9636164.html