Commonly Used Scripts

remove_all_pyc

find . -name "*.pyc" -exec git rm -f {} \;
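
git rm only works on tracked files; if you just want the compiled files deleted from disk, a pure-Python sketch with pathlib does the same walk (rglob and unlink are standard-library calls):

from pathlib import Path

# recursively delete every compiled Python file under the current directory
for pyc in Path('.').rglob('*.pyc'):
    pyc.unlink()
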
find_all_links

import requests
import re

# get url
url = input('Enter a URL (include `http://`): ')

# connect to the url
website = requests.get(url)

# read html
html = website.text

# use re.findall to grab all the links
links = re.findall('"((http|ftp)s?://.*?)"', html)

# output links
for link in links:
    print(link[0])
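
The regex above is brittle against real-world HTML; a sturdier sketch uses the standard library's html.parser (the class name is illustrative), reusing the html string fetched above:

from html.parser import HTMLParser

class LinkExtractor(HTMLParser):
    """Collect the href value of every <a> tag."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.links.extend(value for name, value in attrs
                              if name == 'href' and value)

parser = LinkExtractor()
parser.feed(html)
for link in parser.links:
    print(link)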

rename_with_slice

import os
import glob

os.chdir("/Users/mikeherman/repos/bugs/se-platform/se/core/permissions")
for file in glob.glob("*.json"):
    file_name = os.path.splitext(file)[0]
    extension = os.path.splitext(file)[1]
    new_file_name = file_name[:-6] + extension
    try:
        os.rename(file, new_file_name)
    except OSError as e:
        print(e)
    else:
        print("Renamed {} to {}".format(file, new_file_name))

load_json_without_dupes

def dict_raise_on_duplicates(ordered_pairs):
    """Reject duplicate keys."""
    my_dict = dict()
    for key, value in ordered_pairs:
        if key in my_dict:
            raise ValueError("Duplicate key: {}".format(key))
        else:
            my_dict[key] = value
    return my_dict
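
To actually use it, pass the function as object_pairs_hook when decoding; a minimal usage sketch:

import json

raw = '{"a": 1, "a": 2}'
try:
    data = json.loads(raw, object_pairs_hook=dict_raise_on_duplicates)
except ValueError as err:
    print(err)  # Duplicate key: a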

execution_time

"""
ExecutionTime
This class is used for timing execution of code.
For example:
    timer = ExecutionTime()
    print 'Hello world!'
    print 'Finished in {} seconds.'.format(timer.duration())
"""


import time
import random


class ExecutionTime:
    def __init__(self):
        self.start_time = time.time()

    def duration(self):
        return time.time() - self.start_time


# ---- run code ---- #


timer = ExecutionTime()
my_list = [random.randint(1, 888898) for num in
           range(1, 1000000) if num % 2 == 0]
print('Finished in {} seconds.'.format(timer.duration()))
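
time.time() measures wall-clock time; for benchmarking, the standard library's time.perf_counter() is monotonic and higher resolution. A variant sketch of the same class:

import time

class PerfExecutionTime:
    def __init__(self):
        self.start_time = time.perf_counter()  # monotonic, unaffected by clock changes

    def duration(self):
        return time.perf_counter() - self.start_time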

benchmark_permissions_loading_django

import os
import time
import numpy  # used only for numpy.mean(); the standard library's statistics.mean would also work

# temp file for benchmarking


def timeit(method):
    """Decorator: times each call and appends the duration to the global all_times."""
    def timed(*args, **kw):
        ts = time.time()
        ts = time.time()

        result = method(*args, **kw)
        te = time.time()
        all_times.append(te - ts)

        print(all_times)
        print(numpy.mean(all_times))
        return result

    return timed


def create_new_db():
    os.system("mysqladmin -u root drop DATABASE_NAME -f")
    os.system("mysqladmin -u root create DATABASE_NAME -f")
    os.system("./manage.py syncdb")
    os.system("./manage.py migrate")


@timeit
def load_new_perms():
    os.system("./manage.py LOAD_PERMS_COMMAND")


if __name__ == "__main__":
    all_times = list()
    for _ in range(10):
        create_new_db()
        load_new_perms()

basic_email_web_crawler

import requests
import re

# get url
url = input('Enter a URL (include `http://`): ')

# connect to the url
website = requests.get(url)

# read html
html = website.text

# use re.findall to grab all the links
links = re.findall('"((http|ftp)s?://.*?)"', html)
emails = re.findall(r'([\w.,]+@[\w.,]+\.\w+)', html)


# print the number of links found, then the emails
print("\nFound {} links".format(len(links)))
for email in emails:
    print(email)

basic_link_web_crawler

import requests
import re
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin

# regex
link_re = re.compile(r'href="(.*?)"')


def crawl(url):
    # NOTE: despite the name, this fetches a single page and prints its links
    req = requests.get(url)

    # Check if successful
    if req.status_code != 200:
        return []

    # Find links
    links = link_re.findall(req.text)

    print("
Found {} links".format(len(links)))

    # Search links for emails
    for link in links:

        # Get an absolute URL for a link
        link = urljoin(url, link)

        print(link)

if __name__ == '__main__':
    crawl('http://www.realpython.com')

find_files_recursively

import fnmatch
import os

# constants
PATH = './'
PATTERN = '*.md'


def get_file_names(filepath, pattern):
    matches = []
    if os.path.exists(filepath):
        for root, dirnames, filenames in os.walk(filepath):
            for filename in fnmatch.filter(filenames, pattern):
                # matches.append(os.path.join(root, filename))  # full path
                matches.append(filename)  # just the file name
        if matches:
            print("Found {} files:".format(len(matches)))
            output_files(matches)
        else:
            print("No files found.")
    else:
        print("Sorry that path does not exist. Try again.")


def output_files(list_of_files):
    for filename in list_of_files:
        print(filename)


if __name__ == '__main__':
    get_file_names(PATH, PATTERN)
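
The same recursive search can be written with pathlib (Python 3.5+); a minimal sketch:

from pathlib import Path

def get_file_names_pathlib(filepath, pattern):
    # rglob walks the tree recursively, like os.walk + fnmatch.filter
    return [path.name for path in Path(filepath).rglob(pattern)]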

optimize_images_with_wand

import fnmatch
import os

# pip install Wand
from wand.image import Image
# pip install http://pypi.python.org/packages/source/h/hurry.filesize/hurry.filesize-0.9.tar.gz
from hurry.filesize import size


# constants
PATH = '/../../../..'  # update path
PATTERN = '*.jpg'


def get_image_file_names(filepath, pattern):
    matches = []
    if os.path.exists(filepath):
        for root, dirnames, filenames in os.walk(filepath):
            for filename in fnmatch.filter(filenames, pattern):
                matches.append(os.path.join(root, filename))  # full path
        if matches:
            print("Found {} files, with a total file size of {}.".format(
                len(matches), get_total_size(matches)))
            return matches
        else:
            print("No files found.")
    else:
        print("Sorry that path does not exist. Try again.")


def get_total_size(list_of_image_names):
    total_size = 0
    for image_name in list_of_image_names:
        total_size += os.path.getsize(image_name)
    return size(total_size)


def resize_images(list_of_image_names):
    print("Optimizing ... ")
    for index, image_name in enumerate(list_of_image_names):
        with open(image_name, 'rb') as f:  # image data must be read in binary mode
            image_binary = f.read()
        with Image(blob=image_binary) as img:
            if img.height >= 600:
                img.transform(resize='x600')
                img.save(filename=image_name)
    print("Optimization complete.")


if __name__ == '__main__':
    all_images = get_image_file_names(PATH, PATTERN)
    if all_images:  # guard against no matches (the function returns None)
        resize_images(all_images)
        get_image_file_names(PATH, PATTERN)
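
Reading the file by hand is not required; Wand can open it itself. An equivalent body for the loop above:

with Image(filename=image_name) as img:
    if img.height >= 600:
        img.transform(resize='x600')
        img.save(filename=image_name)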

csv_split


import sys
import os
import csv
import argparse

"""
Splits a CSV file into multiple files based on command line arguments.
    Arguments:
    `-h`: help file of usage of the script
    `-i`: input file name
    `-o`: output file name
    `-r`: row limit to split
    Default settings:
    `output_path` is the current directory
    headers are displayed on each split file
    the default delimiter is a comma
    Example usage:
    # split the csv every 10 rows
    >> python csv_split.py -i input.csv -o output -r 10
"""


def get_arguments():
    """Grab user supplied arguments using the argparse library."""

    # Use argparse to get command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", required=True,
                        help="csv input file (with extension)", type=str)
    parser.add_argument("-o", "--output_file", required=True,
                        help="csv output file (without extension)", type=str)
    parser.add_argument("-r", "--row_limit", required=True,
                        help="row limit to split csv at", type=int)
    args = parser.parse_args()

    # Check if the input_file exists
    is_valid_file(parser, args.input_file)

    # Check if the input_file is valid
    is_valid_csv(parser, args.input_file, args.row_limit)

    return args.input_file, args.output_file, args.row_limit


def is_valid_file(parser, file_name):
    """Ensure that the input_file exists."""
    if not os.path.exists(file_name):
        parser.error("The file '{}' does not exist!".format(file_name))  # parser.error() exits


def is_valid_csv(parser, file_name, row_limit):
    """
    Ensure that the # of rows in the input_file
    is greater than the row_limit.
    """
    row_count = 0
    with open(file_name) as input_csv:
        for row in csv.reader(input_csv):
            row_count += 1
    # Note: You could also use a generator expression
    # and the sum() function to count the rows:
    # row_count = sum(1 for row in csv.reader(input_csv))
    if row_limit > row_count:
        parser.error(
            "The 'row_limit' of '{}' is greater than the number of rows in '{}'!"
            .format(row_limit, file_name)
        )


def parse_file(arguments):
    """
    Splits the CSV into multiple files or chunks based on the row_limit.
    Then create new CSV files.
    """
    input_file = arguments[0]
    output_file = arguments[1]
    row_limit = arguments[2]
    output_path = '.'  # Current directory

    # Read CSV, split into list of lists
    with open(input_file, 'r') as input_csv:
        datareader = csv.reader(input_csv)
        all_rows = []
        for row in datareader:
            all_rows.append(row)

        # Remove header
        header = all_rows.pop(0)

        # Split list of list into chunks
        current_chunk = 1
        for i in range(0, len(all_rows), row_limit):  # Loop through list
            chunk = all_rows[i:i + row_limit]  # Create single chunk

            current_output = os.path.join(  # Create new output file
                output_path,
                "{}-{}.csv".format(output_file, current_chunk)
            )

            # Add header
            chunk.insert(0, header)

            # Write chunk to output file
            with open(current_output, 'w', newline='') as output_csv:
                writer = csv.writer(output_csv)
                writer.writerows(chunk)

            # Output info
            print("")
            print("Chunk # {}:".format(current_chunk))
            print("Filepath: {}".format(current_output))
            print("# of rows: {}".format(len(chunk)))

            # Create new chunk
            current_chunk += 1


if __name__ == "__main__":
    arguments = get_arguments()
    parse_file(arguments)

random_name_generator

from random import randint


def random_name_generator(first, second, x):
    """
        Generates random names.
        Arguments:
         - list of first names
         - list of last names
         - number of random names
    """
    names = []
    for i in range(int(x)):
        random_first = randint(0, len(first) - 1)
        random_last = randint(0, len(second) - 1)
        names.append("{0} {1}".format(
            first[random_first],
            second[random_last])
        )
    return set(names)  # set() drops duplicates, so fewer than x names may be returned


first_names = ["Drew", "Mike", "Landon", "Jeremy", "Tyler", "Tom", "Avery"]
last_names = ["Smith", "Jones", "Brighton", "Taylor"]
names = random_name_generator(first_names, last_names, 5)
print('\n'.join(names))
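
random.choice makes the index bookkeeping unnecessary; an equivalent sketch (the function name is illustrative):

from random import choice

def random_name_generator_choice(first, second, x):
    # the set comprehension mirrors the set() return above
    return {"{} {}".format(choice(first), choice(second)) for _ in range(int(x))}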

html_to_markdown

# Convert all html files in a single directory to markdown
#
# 1. Install pandoc
# 2. Run the script
#!/bin/bash

FILES=*.html
for f in $FILES
do
  # extension="${f##*.}"
  filename="${f%.*}"
  echo "Converting $f to $filename.md"
  pandoc "$f" -t markdown -o "../mds/$filename.md"
  # uncomment this line to delete the source file.
  # rm $f
done

check_my_environment

"""
Pass in a config file based on your environment.
Example:
import check_my_environment
class Main:
    def __init__(self, configFile):
        pass
    def process(self):
        print("ok")
if __name__ == "__main__":
    m = Main(check_my_environment.CONFIGFILE)
    m.process()
"""


import os
import sys
ENVIRONMENT = "development"
CONFIGFILE = None


def get_config_file():
    directory = os.path.dirname(__file__)
    return {
        "development": "{}/../config/development.cfg".format(directory),
        "staging": "{}/../config/staging.cfg".format(directory),
        "production": "{}/../config/production.cfg".format(directory)
    }.get(ENVIRONMENT, None)

CONFIGFILE = get_config_file()

if CONFIGFILE is None:
    sys.exit("Configuration error! Unknown environment set. 
              Edit config.py and set appropriate environment")
print("Config file: {}".format(CONFIGFILE))
if not os.path.exists(CONFIGFILE):
    sys.exit("Configuration error! Config file does not exist")
print("Config ok ....")

jinja_quick_load

"""
Render a quick Jinja2 template.
Thanks Danny - http://pydanny.com/jinja2-quick-load-function.html
Example:
>>> from jinja_quick_load import render_from_template
>>> data = {
...     "date": "June 12, 2014",
...     "items": ["oranges", "bananas", "steak", "milk"]
... }
>>> render_from_template(".", "shopping_list.html", **data)
"""


from jinja2 import FileSystemLoader, Environment


def render_from_template(directory, template_name, **kwargs):
    loader = FileSystemLoader(directory)
    env = Environment(loader=loader)
    template = env.get_template(template_name)
    return template.render(**kwargs)
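
To try it without a template file on disk, jinja2's DictLoader serves templates from a dict; a self-contained sketch:

from jinja2 import DictLoader, Environment

env = Environment(loader=DictLoader({
    "shopping_list.html":
        "<h1>{{ date }}</h1>"
        "<ul>{% for item in items %}<li>{{ item }}</li>{% endfor %}</ul>"
}))
print(env.get_template("shopping_list.html").render(
    date="June 12, 2014", items=["oranges", "bananas", "steak", "milk"]))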

rewrite_git_history

I always forget how to backdate commits, so here we go ...

This is dangerous and should be signed off by the omniscient, omnipotent Git itself. Rewriting history is evil, in other words.

$ git add <file_name>
$ export GIT_COMMITTER_DATE="Sun Jun 15 14:00 2014 +0100"
$ export GIT_AUTHOR_DATE="Sun Jun 15 14:00 2014 +0100"
$ git commit -m "so bad"
$ git push

GIT_COMMITTER_DATE and GIT_AUTHOR_DATE are environment variables; Git records both a committer date and an author date on every commit.

zipper

import os
from datetime import datetime
from zipfile import ZipFile


# set file name and time of creation
today = datetime.now()
file_name = 'zipper_' + today.strftime('%Y.%m.%dh%H%M') + '.zip'
dir_name = 'tmp/'  # update path


def zipdir(path, zip_file):
    for root, dirs, files in os.walk(path):
        for file in files:
            zip_file.write(os.path.join(root, file))


if __name__ == '__main__':
    with ZipFile(file_name, 'w') as zip_file:
        zipdir(dir_name, zip_file)
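
For the simple whole-directory case the standard library can also do this in one call; a sketch with shutil.make_archive (the archive name is illustrative):

import shutil

# writes zipper_backup.zip containing everything under dir_name
shutil.make_archive('zipper_backup', 'zip', dir_name)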

Search every file under the current directory for a string

find . -name "*.py"| xargs  cat | grep org.csv

Mount a Samba share on Linux to sync files

# sudo apt-get install cifs-utils
sudo mount -t cifs //IP/share $(pwd)/share -o username=username

# /etc/fstab entry for mounting at boot:
//192.168.3.145/username /home/username/dev/ cifs defaults,username=username,password=password,uid=uid,gid=gid

mount -t cifs //60.205.230.226/share $(pwd)/share -o username=xxxxxxxx,password=xxxxxxx

sudo mount -t cifs //192.168.0.103/Public /mnt/samba/ -o guest

mount -t smbfs -o codepage=cp936,username=USERNAME,password=PASSWORD //IP/SHARE MOUNTPOINT
or
mount -t smbfs -o codepage=cp936,username=USERNAME,password=PASSWORD //HOSTNAME/SHARE MOUNTPOINT
If no username and password are set, this simplifies to:
mount -t smbfs -o codepage=cp936 //IP-or-HOSTNAME/SHARE MOUNTPOINT

Find a process by port number

lsof -Pnl +M -i4 | grep 8010
On Linux, find the process ID using a given port with lsof:
lsof -i:PORT

Ubuntu: check which ports a process is using

sudo netstat -anp | grep PID

SSH login with an inline password

sudo apt-get install sshpass
sshpass -p '12345678' ssh androidyue@10.0.5.10
Note: the single quotes above are required; double quotes will not work.

Non-interactive sudo

echo password | sudo -S ls

Run commands on a remote host over SSH

ssh centos@192.168.202.205 << AAA
    ls
    exit
AAA

Quote the heredoc delimiter (<< 'AAA') if the commands should not be expanded by the local shell first.
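
The same one-off command can also be run from Python with the third-party paramiko library; a minimal sketch (host and user taken from the example above, the password is a placeholder):

import paramiko

client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect('192.168.202.205', username='centos', password='PASSWORD')
stdin, stdout, stderr = client.exec_command('ls')
print(stdout.read().decode())
client.close()
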
Original article: https://www.cnblogs.com/bergus/p/pythonscripts.html