每个数据科学家都应该知道的12个Python特性!
来源:大数据应用 本文约5700字,建议阅读11分钟
本文我们将深入探讨每个数据科学家都应该了解的12个Python特性。
# Squares of 1..10 built with a single list comprehension.
_list = [n ** 2 for n in range(1, 11)]

# Flatten a 3x3 matrix: the first `for` walks the rows, the second walks
# the items of each row (clauses read left to right, like nested loops).
matrix = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
flat_list = [item for line in matrix for item in line]

print(_list)
print(flat_list)
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
[1, 2, 3, 4, 5, 6, 7, 8, 9]
# Dictionary comprehension: map each odd number in 1..10 to its square.
_dict = {odd: odd ** 2 for odd in range(1, 11) if odd % 2 != 0}

# Set comprehension: the squares of the numbers 1..10.
_set = {n ** 2 for n in range(1, 11)}

# Generator expression: the same squares, produced lazily on demand.
_gen = (n ** 2 for n in range(1, 11))

print(_dict)
print(_set)
# A generator can be consumed only once; list() drains it here.
print(list(_gen))
{1: 1, 3: 9, 5: 25, 7: 49, 9: 81}
{64, 1, 4, 36, 100, 9, 16, 49, 81, 25}
[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
# enumerate() yields (index, item) pairs, so no manual counter is needed.
for idx, value in enumerate(["a", "b", "c", "d"]):
    # Print only the items sitting at even positions (0, 2, ...).
    if idx % 2 == 0:
        print(value)
a
c
x = [1, 2, 3, 4]
y = [5, 6, 7, 8]
# zip() pairs up the items of both lists position by position, so the
# loop sees (1, 5), (2, 6), ... and prints each pair with its sum and product.
for a, b in zip(x, y):
    print(a, b, a + b, a * b)
1 5 6 5
2 6 8 12
3 7 10 21
4 8 12 32
def fib_gen(n):
    """Yield the first ``n`` Fibonacci numbers, starting from 0.

    Values are produced lazily, one per iteration; only the two most
    recent numbers are kept between steps.
    """
    a, b = 0, 1
    for _ in range(n):
        yield a
        a, b = b, a + b


res = fib_gen(10)
# list() drains the generator directly; no wrapper expression is needed.
print(list(res))
[0, 1, 1, 2, 3, 5, 8, 13, 21, 34]
numbers = range(10)
# filter() keeps the items for which the lambda returns a truthy value.
# It is lazy, so the result is unpacked into a list before printing.
even_numbers = [*filter(lambda n: n % 2 == 0, numbers)]
print(even_numbers)
[0, 2, 4, 6, 8]
import pandas as pd

data = {
    "sales_person": ["Alice", "Bob", "Charlie", "David"],
    "sale_amount": [100, 200, 300, 400],
}
df = pd.DataFrame(data)
threshold = 250
# The comparison already evaluates to a boolean, so the lambda returns it
# directly instead of wrapping it in `True if ... else False`.
df["above_threshold"] = df["sale_amount"].apply(lambda x: x >= threshold)
# Bare expression: presumably left so a notebook cell displays the frame.
df
sales_person sale_amount above_threshold
0 Alice 100 False
1 Bob 200 False
2 Charlie 300 True
3 David 400 True
from functools import reduce  # reduce() is not a builtin in Python 3

numbers = range(10)
# Use map(), filter(), and reduce() to preprocess and aggregate the list of numbers
even_numbers = filter(lambda x: x % 2 == 0, numbers)
squares = map(lambda x: x**2, even_numbers)
# Both iterators above are lazy; reduce() is what finally pulls values
# through the pipeline and folds them into a single sum.
sum_of_squares = reduce(lambda x, y: x + y, squares)
print(f"Sum of the squares of even numbers: {sum_of_squares}")
Sum of the squares of even numbers: 120
data = [1, 3, 5, 7]
# any() is True as soon as one item is even;
# all() is True only when every item is odd.
has_even = any(item % 2 == 0 for item in data)
all_odd = all(item % 2 == 1 for item in data)
print(has_even)
print(all_odd)
False
True
import random


def random_numbers():
    """Yield an endless stream of floats drawn with ``random.random()``."""
    while True:
        yield random.random()


# Use next() to find the first number greater than 0.9
# (next() stops pulling from the infinite generator at the first match).
num = next(x for x in random_numbers() if x > 0.9)
print(f"First number greater than 0.9: {num}")
First number greater than 0.9: 0.9444805819267413
09 默认字典
defaultdict是内置类dict的子类,允许为缺失的键提供默认值。
defaultdict对于处理丢失或不完整的数据非常有用,例如在处理稀疏矩阵或特征向量时。它还可用于计算分类变量的频率。
一个例子是计算列表中各个项目的出现次数。如果将int作为default_factory传入,那么每个新键对应的初始值都为0。
from collections import defaultdict

# int() returns 0, so a key seen for the first time starts at zero and
# can be incremented without checking whether it already exists.
count = defaultdict(int)
for item in ['a', 'b', 'a', 'c', 'b', 'a']:
    count[item] += 1
count
defaultdict(int, {'a': 3, 'b': 2, 'c': 1})
from functools import partial


def add(x, y):
    """Return the sum of ``x`` and ``y``."""
    return x + y


# partial() pre-fills the first positional argument (x=1), so the new
# callable only needs the remaining argument y.
increment = partial(add, 1)
increment(1)
2
11 lru_cache
lru_cache是functools模块中的一个装饰器,它允许使用有限大小的缓存来缓存函数的结果。
lru_cache对于优化计算成本较高的函数或可能使用相同参数多次调用的模型训练过程非常有用。
缓存可以帮助加快函数的执行速度并降低总体计算成本。
这是一个使用缓存高效计算Fibonacci numbers(https://en.wikipedia.org/wiki/Fibonacci_number)的示例(这种技术在计算机科学中称为记忆化,memoization)。
from functools import lru_cache


@lru_cache(maxsize=None)
def fibonacci(n):
    """Return the n-th Fibonacci number, memoised with ``lru_cache``.

    Each distinct ``n`` is computed once and then served from the cache,
    so the naive doubly-recursive definition does a linear amount of work.
    Passing a float such as ``1e3`` makes the base cases return floats,
    so the result is a float approximation rather than an exact integer.
    """
    if n <= 1:
        return n
    return fibonacci(n - 1) + fibonacci(n - 2)


# A cold call for n = 1000 recurses about 1000 frames deep, which can
# exceed the interpreter's default recursion limit. Filling the cache in
# steps of 100 keeps every call shallow. Floats are used for the warm-up so
# the cached values are floats and the final result keeps its float form.
for checkpoint in range(100, 1000, 100):
    fibonacci(float(checkpoint))
fibonacci(1e3)
4.346655768693743e+208
from dataclasses import dataclass


@dataclass
class Person:
    """Plain record type; @dataclass generates __init__, __repr__ and __eq__."""

    name: str
    age: int
    city: str


p = Person("Alice", 30, "New York")
# The generated __repr__ lists every field by name.
print(p)
Person(name='Alice', age=30, city='New York')
评论