paddlenlp

Форк
0
/
multiprocess_tool.py 
102 строки · 3.2 Кб
1
#   Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
2
#
3
# Licensed under the Apache License, Version 2.0 (the "License");
4
# you may not use this file except in compliance with the License.
5
# You may obtain a copy of the License at
6
#
7
#     http://www.apache.org/licenses/LICENSE-2.0
8
#
9
# Unless required by applicable law or agreed to in writing, software
10
# distributed under the License is distributed on an "AS IS" BASIS,
11
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
# See the License for the specific language governing permissions and
13
# limitations under the License.
14

15
import argparse
16
import multiprocessing
17
import os
18
import time
19
import warnings
20
from multiprocessing import Process
21

22
"""
23
Multi-process batch processing tool
24

25
This tool provides a multi-process batch processing method.
26
For example, downloading or preprocessing data in batches across multiple processes.
27

28
The tool relies on executable shell commands or scripts. Its essence is to use Python's
29
multi-process library to create multiple processes, and call executable commands or
30
scripts through the os.system API.
31

32
Executable commands or scripts are passed in via a txt text file, organized by line.
33
For example, the following file downloads, unpacks and then deletes a series of archives.
34

35
batch_cmd.txt
36

37
wget http://xxxx.com/0.tar && tar -xf 0.tar && rm 0.tar
38
wget http://xxxx.com/1.tar && tar -xf 1.tar && rm 1.tar
39
...
40
wget http://xxxx.com/99.tar && tar -xf 99.tar && rm 99.tar
41

42
How to run:
43

44
python multiprocess_tool.py --num_proc 10 --shell_cmd_list_filename batch_cmd.txt
45

46
"""
47

48

49
def process_fn(cmd_list):
    """Run every shell command in ``cmd_list`` sequentially.

    Each command is handed to ``os.system``. A command that returns a
    non-zero status, or whose launch raises, is reported on stdout and the
    loop moves on to the next command; nothing is propagated to the caller.

    Args:
        cmd_list: Iterable of shell command strings.
    """
    for command in cmd_list:
        try:
            status = os.system(command)
        except Exception as err:
            # Launch problems are reported but must not stop the batch.
            print(err)
            continue
        if status != 0:
            print(f"execute command: {command} failed.")
57

58

59
def read_command(shell_cmd_list_filename):
    """Load shell commands from a text file, one command per line.

    Leading and trailing whitespace is stripped from every line. Blank or
    whitespace-only lines are skipped, so they neither spawn an empty shell
    nor take up a slot when the commands are later split across processes.

    Args:
        shell_cmd_list_filename: Path of the text file holding the commands.

    Returns:
        A list of non-empty command strings in file order.
    """
    with open(shell_cmd_list_filename, "r") as f:
        stripped_lines = (line.strip() for line in f)
        return [cmd for cmd in stripped_lines if cmd]
66

67

68
def parallel_process(cmd_list, nproc=20):
    """Execute shell commands across several worker processes.

    The command list is split into contiguous chunks whose sizes differ by at
    most one, so no worker is started with an empty chunk and every requested
    process gets work whenever there are at least ``nproc`` commands. Each
    chunk is run by ``process_fn`` in its own ``Process``; the call returns
    once all workers have been joined.

    Args:
        cmd_list: Sequence of shell command strings.
        nproc: Maximum number of worker processes to start. Must be >= 1.

    Raises:
        ValueError: If ``nproc`` is smaller than 1.
    """
    if nproc < 1:
        raise ValueError(f"nproc must be a positive integer, got {nproc}.")
    if nproc > multiprocessing.cpu_count():
        warnings.warn(
            "The set number of processes exceeds the number of cpu cores, please confirm whether it is reasonable."
        )

    num_cmd = len(cmd_list)
    num_workers = min(nproc, num_cmd)
    if num_workers == 0:
        # Nothing to run; avoid dividing by zero below.
        return

    # The first `extra` workers take one additional command each.
    base, extra = divmod(num_cmd, num_workers)
    workers = []
    start = 0
    for i in range(num_workers):
        end = start + base + (1 if i < extra else 0)
        p = Process(target=process_fn, args=(cmd_list[start:end],))
        workers.append(p)
        p.start()
        start = end

    for p in workers:
        p.join()
85

86

87
def main(args):
    """Read the command file, run the commands in parallel, and print the elapsed time.

    Elapsed time is measured with ``time.perf_counter`` so the reported
    duration is not distorted by system clock adjustments during the run.

    Args:
        args: Object exposing ``shell_cmd_list_filename`` (path of the command
            file) and ``num_proc`` (number of worker processes).
    """
    start = time.perf_counter()
    shell_cmd_list = read_command(args.shell_cmd_list_filename)
    parallel_process(shell_cmd_list, args.num_proc)
    elapsed = time.perf_counter() - start
    print("Cost time: {:.2f}".format(elapsed))
93

94

95
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the batch.
    parse = argparse.ArgumentParser(description="multi-process batch processing tool")
    parse.add_argument("--num_proc", type=int, default=20)
    # The command file is mandatory: without it main() would try to open None.
    parse.add_argument(
        "--shell_cmd_list_filename",
        type=str,
        required=True,
        help="a txt file contains shell command list to be execute.",
    )
    args = parse.parse_args()
    main(args)
103

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.