Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
Last revision Both sides next revision
software_development:python_pandas [2022/08/04 14:39]
prgram
software_development:python_pandas [2023/05/16 15:09]
prgram [encoding_errors - 'ignore']
Line 2: Line 2:
 {{INLINETOC}} {{INLINETOC}}
  
-=== Shapes ​===+=== etc : list ===
 <code python> <code python>
-df.shape +set[list] ​# unique value 
-df.describe() +[list].sort() #​자동적용?​ 
-df.info()+[list1] + [list2] ​ #​list합치기
 </​code>​ </​code>​
  
-#use the dataframe.nunique() function to find the unique values +===== shape of df ===== 
-unique_counts ​pd.DataFrame.from_records([(col,​ df[col].nunique()) for col in df.columns],​columns=['​Column_Name',​ '​Num_Unique'​]).sort_values(by=['​Num_Unique'​]) +=== Pivot_table ​===
- +
-df.loc[df.sex=='​W',​ '​year_born'​].hist() +
- +
-=== Selects ​===+
 <code python> <code python>
-iloc: Select by position +df.pivot_table(index=[인덱스컬럼],​ 
-loc: Select by label+               columns=[컬럼1,​컬럼2],​ 
 +               ​values=[값],​ 
 +               ​aggfunc='​sum'​).reset_index() 
 +</​code>​ 
 +  * string일 때는 aggfunc='​max'​ 
 +  * index에 NULL 있으면 안됨 
 +== fillna == 
 +<code python>​ 
 +df[['​a','​b','​c'​]] = df[['​a','​b','​c'​]].fillna(''​)
 </​code>​ </​code>​
  
 +=== group by ===
 +<code python>
 +df.groupby([컬럼들]).agg({'​컬럼':​sum}).reset_index()
  
-   +df.groupby([COLUMNS])['​COLUMN'​].max().reset_index()
-   +
-==I/O file==  ​+
  
 +df = df.assign(date=pd.to_numeric(df['​date'​],​ errors='​coerce'​)).groupby(['​코드',​ '​종목명'​]).agg({'​date':​np.min}).reset_index().drop_duplicates()
  
-=== chunk === +df df[['​코드'​'​date'​]].groupby(['​코드'​]).agg({'​date':​ [np.minnp.max]}).reset_index(level='​종목코드'​
-<code python>​ +df.columns = df.columns.droplevel()
-for indexchunk in enumeratepd.read_csv(filenamechunksize=chunk_size,​ dtype=str), start=1): +
-  chunk.memory_usage()+
 </​code>​ </​code>​
  
 +=== rank ===
 <code python> <code python>
-df_chunk ​pd.read_csv(r'../​input/​data.csv', chunksize=1000000# chink : # of lines +df['​rank'​] ​df.groupby('code')['​value'​].rank(ascending=False)
-for chunk in df_chunk: ​  +
-    # perform data filtering  +
-    chunk_filter = chunk_preprocessing(chunk)+
 </​code>​ </​code>​
  
 +=== merge ===
 <code python> <code python>
-df3=pd.read_csv("​abc.txt"​skiprows=1header=None, +df_out ​df_out.merge(dfon=['​no'​'​name'​]how='​outer'​) #left_on right_on
-               sep="​|",​ nrows=100, encoding="​utf-16"​) #, error_bad_lines=False) +
- +
-with open("​abc.txt",​ encoding="​utf-16"​) as f: +
-    i=0 +
-    for line in f: +
-        i=i+1 +
-        print(line) +
-        if i >3 : break +
-</​code>​ +
-           +
-           +
-file open/ append lines : https://​thispointer.com/​how-to-append-text-or-lines-to-a-file-in-python/ ​          +
-=== row iteration === +
-<code python>​ +
-for idx,row in anime[:​2].iterrows():​ +
-    print(idx, row)+
 </​code>​ </​code>​
  
  
 +===== modify =====
 === Series to DF === === Series to DF ===
 <code python> <code python>
 df = df.append(pd.DataFrame(pd.Series(dict_row)).transpose(),​ ignore_index=True) df = df.append(pd.DataFrame(pd.Series(dict_row)).transpose(),​ ignore_index=True)
 </​code>​ </​code>​
 +
 +
 === rename === === rename ===
 <code python> <code python>
Line 72: Line 62:
 df.columns = ['​1'​] + df.columns[1:​].tolist() df.columns = ['​1'​] + df.columns[1:​].tolist()
 </​code>​ </​code>​
 +
 +=== order of columns ===
 +<code python>
 +#1
 +df = df.sort_index(axis='​columns',​ level = '​MULTILEVEL INDEX NAME/​no'​)
 +#2
 +df.columns
 +col_order = ['​a','​b','​c'​]
 +df = df.reindex(col_order,​ axis='​columns'​)
 +</​code>​
 +
  
 === map === === map ===
Line 77: Line 78:
 df['​코드'​] = '​A'​ + df['​코드'​].map(lambda x: f'​{x:​0>​6}'​) ​ #6글자로 df['​코드'​] = '​A'​ + df['​코드'​].map(lambda x: f'​{x:​0>​6}'​) ​ #6글자로
 </​code>​ </​code>​
-=== Pivot_table ​===+ 
 + 
 +===== get info ===== 
 + 
 +=== Shapes ​===
 <code python> <code python>
-df.pivot_table(index=[인덱스컬럼],​ +df.shape 
-               columns=[컬럼1,​컬럼2],​ +df.describe() 
-               ​values=[값],​ +df.info()
-               ​aggfunc='​sum'​).reset_index()+
 </​code>​ </​code>​
  
-=== group by ===+#use the dataframe.nunique() function to find the unique values 
 +unique_counts ​pd.DataFrame.from_records([(col,​ df[col].nunique()) for col in df.columns],​columns=['​Column_Name',​ '​Num_Unique'​]).sort_values(by=['​Num_Unique'​]) 
 + 
 +df.loc[df.sex=='​W',​ '​year_born'​].hist() 
 + 
 +=== type ===
 <code python> <code python>
-df.groupby([컬럼들]).agg({'컬럼':sum}).reset_index()+df['​value'​= pd.to_numeric(df['value'​],​ errors='​coerce') 
 +df['​value'​].dtype 
 +</​code>​
  
-df = df.assign(date=pd.to_numeric(df['​date'​],​ errors='​coerce'​)).groupby(['​코드',​ '​종목명'​]).agg({'​date':​np.min}).reset_index().drop_duplicates() 
  
-df = df[['코드', 'date']].groupby(['코드']).agg({'date': [np.minnp.max]}).reset_index(level='종목코드') +===== get element ===== 
-df.columns = df.columns.droplevel()+ 
 +=== Selects === 
 +<code python>​ 
 +iloc: Select by position 
 +loc: Select by label 
 +   
 +df.loc[:,​~df.columns.isin(['a','​b'])  
 + 
 +df[~df['a'].isin(['1','​2','​3'​]) & df['​b'​]=='3' )] #​row-wise 
 +df.loc[~( ​df['​a'​].isin(['​1','​2','​3'​]& df['​b'​]=='​3'​ ), 8] #​row-wise & column
 </​code>​ </​code>​
  
-=== merge ===+=== row iteration ​===
 <code python> <code python>
-df_out = df_out.merge(dfon=['​no',​ '​name'​], how='​outer'​)+for idx,row in anime[:2].iterrows():​ 
 +    print(idxrow)
 </​code>​ </​code>​
  
-=== rank ===+   
 +   
 +=====I/O file===== 
 + 
 +=== encoding_errors - '​ignore'​=== 
 +Encoding을 제대로 지정했는데도 읽기에 실패하면 encoding_errors='ignore'로 디코딩 오류를 무시하고 읽는다.
 +공공데이터에서 이런 경우가 많음. 
 <code python> <code python>
-df['rank'df.groupby('code')['value'].rank(ascending=False)+import chardet 
 +with open(file, ​'rb') as rawdata: 
 +    result ​chardet.detect(rawdata.read(100000)) 
 +result 
 + 
 + 
 +data = pd.read_csv( file, encoding='cp949',​ encoding_errors='​ignore') 
 +# on_bad_lines='skip' 
 +# error_bad_lines=False
 </​code>​ </​code>​
 +
 === to_numeric === === to_numeric ===
 <code python> <code python>
Line 120: Line 156:
 </​code>​ </​code>​
  
 +=== chunk ===
 +<code python>
 +for index, chunk in enumerate( pd.read_csv(filename,​ chunksize=chunk_size,​ dtype=str), start=1):
 +  chunk.memory_usage()
 +</​code>​
  
-=== type === 
 <code python> <code python>
-df['​value'​] ​= pd.to_numeric(df['value']errors='​coerce'​+df_chunk ​= pd.read_csv(r'../​input/​data.csv', ​chunksize=1000000# chunk : # of lines 
-df['​value'​].dtype+for chunk in df_chunk: ​  
 +    # perform data filtering  
 +    chunk_filter = chunk_preprocessing(chunk)
 </​code>​ </​code>​
 +
 +<code python>
 +df3=pd.read_csv("​abc.txt",​ skiprows=1, header=None,​
 +               ​sep="​|",​ nrows=100, encoding="​utf-16"​) #, error_bad_lines=False)
 +
 +with open("​abc.txt",​ encoding="​utf-16"​) as f:
 +    i=0
 +    for line in f:
 +        i=i+1
 +        print(line)
 +        if i >3 : break
 +</​code>​
 +          ​
 +          ​
 +file open/ append lines : https://​thispointer.com/​how-to-append-text-or-lines-to-a-file-in-python/ ​         ​
 +
 +
 +
 +
 +
 +
 +
  
 {{tag>​software_development python 파이썬 pandas 데이터정리 data_analysis}} {{tag>​software_development python 파이썬 pandas 데이터정리 data_analysis}}