o
    vhRl                     @  sF  d Z ddlmZ ddlmZ ddlmZmZ ddlZddlm	Z	 ddl
ZddlmZ ddlmZmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZ ddlZddlmZmZmZ ddl m!Z! ddl"m#  m$  m%Z& ddl'm(Z( e	r~ddl)m*Z*m+Z+m,Z, edZ-edZ.d!ddZ/d"ddZ0G dd dZ1G dd  d e(ej2Z3dS )#a  
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
    )annotations)abc)datetime	timedeltaN)TYPE_CHECKING)
get_option)read_double_with_byteswapread_float_with_byteswapread_uint16_with_byteswapread_uint32_with_byteswapread_uint64_with_byteswap)Parserget_subheader_index)cast_from_unit_vectorized)EmptyDataError)	DataFrame	Timestampisna)
get_handle)
ReaderBase)CompressionOptionsFilePath
ReadBufferz
1970-01-01z
1960-01-01sas_datetimefloatunitstrc                 C  sR   t | rtjS |dkrtdddt| d S |dkr%tdddt| d S td)Ns     )secondsd)dayszunit must be 'd' or 's')r   pdNaTr   r   
ValueError)r   r    r&   T/var/www/html/hyperkenya/venv/lib/python3.10/site-packages/pandas/io/sas/sas7bdat.py_parse_datetimeC   s   r(   sas_datetimes	pd.Seriesreturnc                 C  sl   t t d}|dkr#t| jddd}|d| }tj|| jddS t	j
| dd| }tj|d	| jdd
S )a  
    Convert to Timestamp if possible, otherwise to datetime.datetime.
    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
       Dates or datetimes in SAS
    unit : {'d', 's'}
       "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
       Series of datetime64 dtype or datetime.datetime.
    r   ms)r   out_unitzM8[ms]FindexcopyzM8[D]dtypezM8[s]r2   r/   r0   )_sas_origin_unix_originas_unitr   _valuesviewr#   Seriesr/   nparray)r)   r   tdmillisdt64msvalsr&   r&   r'   _convert_datetimesQ   s   r@   c                   @  sH   e Zd ZU ded< ded< ded< ded< ded< ded	< dddZdS )_Columnintcol_idstr | bytesnamelabelformatbytesctypelengthr+   Nonec                 C  s(   || _ || _|| _|| _|| _|| _d S N)rC   rE   rF   rG   rI   rJ   )selfrC   rE   rF   rG   rI   rJ   r&   r&   r'   __init__w   s   

z_Column.__init__N)rC   rB   rE   rD   rF   rD   rG   rD   rI   rH   rJ   rB   r+   rK   )__name__
__module____qualname____annotations__rN   r&   r&   r&   r'   rA   o   s   
 rA   c                   @  sJ  e Zd ZU dZded< ded< 								dYdZddZd[ddZd[ddZd[dd Zd\d!d"Z	d\d#d$Z
d]d&d'Zd^d*d+Zd_d,d-Zd`d/d0Zdad2d3Zd\d4d5Zdbd6d7Zd\d8d9Zd\d:d;Zdcd<d=Zdcd>d?Zdcd@dAZdcdBdCZdcdDdEZdcdFdGZdcdHdIZdcdJdKZdddedMdNZdOdP Zd]dQdRZdSdT ZdfdWdXZ dS )gSAS7BDATReadera  
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : bool, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : bool, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : str, 'infer', defaults to None
        String encoding acc. to Python standard encodings,
        encoding='infer' tries to detect the encoding from the file header,
        encoding=None will leave the data in binary format.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    rB   _int_lengthzbytes | None_cached_pageNTinferpath_or_bufFilePath | ReadBuffer[bytes]convert_datesboolblank_missing	chunksize
int | Noneencoding
str | Noneconvert_textconvert_header_textcompressionr   r+   rK   c
           
   	   C  s   || _ || _|| _|| _|| _|| _|| _d| _d| _g | _	g | _
g | _g | _g | _d | _g | _g | _g | _d| _d| _d| _t|dd|	d| _| jj| _| j| j| j| j| j| j| j| jd g	| _ z| !  | "  W d S  t#yx   | $   w )Nzlatin-1    r   rbF)is_textrb   )%r/   rY   r[   r\   r^   r`   ra   default_encodingrb   column_names_rawcolumn_namescolumn_formatscolumns%_current_page_data_subheader_pointersrU   _column_data_lengths_column_data_offsets_column_types_current_row_in_file_index_current_row_on_page_indexr   handleshandle_path_or_buf_process_rowsize_subheader_process_columnsize_subheader_process_subheader_counts_process_columntext_subheader_process_columnname_subheader#_process_columnattributes_subheader_process_format_subheader_process_columnlist_subheader_subheader_processors_get_properties_parse_metadata	Exceptionclose)
rM   rW   r/   rY   r[   r\   r^   r`   ra   rb   r&   r&   r'   rN      sT   
zSAS7BDATReader.__init__
np.ndarrayc                 C     t j| jt jdS )z5Return a numpy int64 array of the column data lengthsr1   )r:   asarrayrl   int64rM   r&   r&   r'   column_data_lengths      z"SAS7BDATReader.column_data_lengthsc                 C  r   )z0Return a numpy int64 array of the column offsetsr1   )r:   r   rm   r   r   r&   r&   r'   column_data_offsets   r   z"SAS7BDATReader.column_data_offsetsc                 C  s   t j| jt ddS )zj
        Returns a numpy character array of the column types:
           s (string) or d (double)
        S1r1   )r:   r   rn   r2   r   r&   r&   r'   column_types   s   zSAS7BDATReader.column_typesc                 C  s   | j   d S rL   )rq   r   r   r&   r&   r'   r      s   zSAS7BDATReader.closec                 C  s  | j d | j d| _| jdttj tjkrtd| tj	tj
}|tjkr:d| _d| _tj| _tj| _nd| _tj| _tj| _d| _| tjtj}|tjkrYtj}nd}| tjtj}|dkrqd	| _tjd
k| _n	d| _tjdk| _| tjtjd }|tj v rtj | | _!| j"dkr| j!| _"nd| d| _!t#ddd}| $tj%| tj&}|t'j(|dd | _)| $tj*| tj+}|t'j(|dd | _,| -tj.| tj/| _0| j | j0d }|  j|7  _t| j| j0krtd| -tj1| tj2| _3d S )Nr   i   z'magic number mismatch (not a SAS file?)T   F      <big>littlerV   zunknown (code=)r   r   r   )r   z*The SAS7BDAT file appears to be truncated.)4rs   seekreadrU   lenconstmagicr%   _read_bytesalign_1_offsetalign_1_lengthu64_byte_checker_valueU64rT   page_bit_offset_x64_page_bit_offsetsubheader_pointer_length_x64_subheader_pointer_lengthpage_bit_offset_x86subheader_pointer_length_x86align_2_offsetalign_2_lengthalign_1_checker_valuealign_2_valueendianness_offsetendianness_length
byte_ordersys	byteorderneed_byteswapencoding_offsetencoding_lengthencoding_namesinferred_encodingr^   r   _read_floatdate_created_offsetdate_created_lengthr#   to_timedeltadate_createddate_modified_offsetdate_modified_lengthdate_modified
_read_uintheader_size_offsetheader_size_lengthheader_lengthpage_size_offsetpage_size_length_page_length)rM   bufalign1epochxr&   r&   r'   r}      sd   





zSAS7BDATReader._get_propertiesr   c                 C  s(   | j | jpdd}|jr|   t|S )Nr   )nrows)r   r\   emptyr   StopIteration)rM   dar&   r&   r'   __next__C  s
   zSAS7BDATReader.__next__offsetwidthc                 C  sN   | j d usJ |dkrt| j || jS |dkrt| j || jS |   td)Nr   r   zinvalid float width)rU   r	   r   r   r   r%   rM   r   r   r&   r&   r'   r   K  s   

zSAS7BDATReader._read_floatc                 C  s~   | j d usJ |dkr| |dd S |dkrt| j || jS |dkr+t| j || jS |dkr7t| j || jS |   td)Nr   r      r   r   zinvalid int width)rU   r   r
   r   r   r   r   r%   r   r&   r&   r'   r   Z  s"   


zSAS7BDATReader._read_uintrJ   c                 C  sB   | j d usJ || t| j kr|   td| j |||  S )NzThe cached page is too small.)rU   r   r   r%   rM   r   rJ   r&   r&   r'   r   n  s
   zSAS7BDATReader._read_bytesrD   c                 C  s   |  | ||dS )N     )_convert_header_textr   rstripr   r&   r&   r'   _read_and_convert_header_textu  s   z,SAS7BDATReader._read_and_convert_header_textc                 C  sV   d}|s)| j | j| _t| jdkrd S t| j| jkr!td|  }|rd S d S )NFr   z2Failed to read a meta data page from the SAS file.)rs   r   r   rU   r   r%   _process_page_meta)rM   doner&   r&   r'   r~   z  s   zSAS7BDATReader._parse_metadatac                 C  sZ   |    tjtjtjg }| j|v r|   | jtjk}| jtjk}t|p+|p+| j	g kS rL   )
_read_page_headerr   page_meta_typespage_amd_typepage_mix_type_current_page_type_process_page_metadatapage_data_typerZ   rk   )rM   ptis_data_pageis_mix_pager&   r&   r'   r     s   
z!SAS7BDATReader._process_page_metac                 C  s^   | j }tj| }| |tjtj@ | _tj| }| |tj| _	tj
| }| |tj| _d S rL   )r   r   page_type_offsetr   page_type_lengthpage_type_mask2r   block_count_offsetblock_count_length_current_page_block_countsubheader_count_offsetsubheader_count_length_current_page_subheaders_count)rM   
bit_offsettxr&   r&   r'   r     s   



z SAS7BDATReader._read_page_headerc                 C  s  | j }t| jD ]}tj| }|| j|  }| || j}|| j7 }| || j}|| j7 }| |d}|d7 }| |d}|dksG|tjkrHq| 	|| j}	t
|	}
| j|
 }|d u r|tjdfv }|tjk}| jrx|rx|rx| j||f q|   td|	 ||| qd S )Nr   r   zUnknown subheader signature )r   ranger   r   subheader_pointers_offsetr   r   rT   truncated_subheader_idr   r   r|   compressed_subheader_idcompressed_subheader_typerb   rk   appendr   r%   )rM   r   ir   total_offsetsubheader_offsetsubheader_lengthsubheader_compressionsubheader_typesubheader_signaturesubheader_indexsubheader_processorf1f2r&   r&   r'   r     s<   





z%SAS7BDATReader._process_page_metadatac                 C  s   | j }|}|}| jr|d7 }|d7 }n|d7 }|d7 }| |tj|  || _| |tj|  || _| |tj|  || _	| |tj
|  || _tj| }| || || _| |d| _| |d| _d S )Ni  i  ib  iz  r   )rT   r   r   r   row_length_offset_multiplier
row_lengthrow_count_offset_multiplier	row_countcol_count_p1_multipliercol_count_p1col_count_p2_multipliercol_count_p2'row_count_on_mix_page_offset_multiplier_mix_page_row_count_lcs_lcp)rM   r   rJ   int_len
lcs_offset
lcp_offsetmxr&   r&   r'   rt     s4   

z)SAS7BDATReader._process_rowsize_subheaderc                 C  sX   | j }||7 }| ||| _| j| j | jkr*td| j d| j d| j d d S d S )Nz Warning: column count mismatch (z + z != z)
)rT   r   column_countr   r  print)rM   r   rJ   r  r&   r&   r'   ru     s   
z,SAS7BDATReader._process_columnsize_subheaderc                 C     d S rL   r&   r   r&   r&   r'   rv     s   z(SAS7BDATReader._process_subheader_countsc           	      C  s  || j 7 }| |tj}| ||}|d| d}| j| t| jdkrd}tj	D ]}||v r5|}q-|| _
|| j 8 }|d }| jrI|d7 }| || j}|d}|dkrwd| _|d }| jrg|d7 }| || j}|d| j | _nB|tjkr|d	 }| jr|d7 }| || j}|d| j | _n"| jdkrd| _|d }| jr|d7 }| || j}|d| j | _t| d
r| | j| _d S d S d S )Nr   r   r   rc      r           (   creator_proc)rT   r   r   text_block_size_lengthr   r   rg   r   r   compression_literalsrb   r   r  r  r  rle_compressionhasattrr   )	rM   r   rJ   text_block_sizer   	cname_rawcompression_literalcloffset1r&   r&   r'   rw     sT   






"z,SAS7BDATReader._process_columntext_subheaderc                 C  s   | j }||7 }|d|  d d }t|D ]Q}|tj|d   tj }|tj|d   tj }|tj|d   tj }| |tj}	| |tj	}
| |tj
}| j|	 }||
|
|  }| j| | qd S )Nr      r   r   )rT   r   r   column_name_pointer_length!column_name_text_subheader_offsetcolumn_name_offset_offsetcolumn_name_length_offsetr   !column_name_text_subheader_lengthcolumn_name_offset_lengthcolumn_name_length_lengthrg   rh   r   r   )rM   r   rJ   r  column_name_pointers_countr   text_subheadercol_name_offsetcol_name_lengthidx
col_offsetcol_lenname_rawcnamer&   r&   r'   rx   "  sB   
z,SAS7BDATReader._process_columnname_subheaderc           
      C  s   | j }|d|  d |d  }t|D ]Y}|| tj ||d   }|d|  tj ||d   }|d|  tj ||d   }| ||}	| j|	 | |tj	}	| j
|	 | |tj}	| j|	dkridnd qd S )Nr   r  r   r      d   s)rT   r   r   column_data_offset_offsetcolumn_data_length_offsetcolumn_type_offsetr   rm   r   column_data_length_lengthrl   column_type_lengthrn   )
rM   r   rJ   r  column_attributes_vectors_countr   col_data_offsetcol_data_len	col_typesr   r&   r&   r'   ry   C  s,   
z2SAS7BDATReader._process_columnattributes_subheaderc                 C  r  rL   r&   r   r&   r&   r'   r{   ]  s   z,SAS7BDATReader._process_columnlist_subheaderc                 C  sx  | j }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }	| |tj	}
t
|
t| jd }| |tj}| |tj}| |tj}t
|t| jd }| |tj}| |	tj}| j| }| ||||  }| j| }| ||||  }t| j}t|| j| ||| j| | j| }| j| | j| d S )N   r   )rT   r   )column_format_text_subheader_index_offsetcolumn_format_offset_offsetcolumn_format_length_offset(column_label_text_subheader_index_offsetcolumn_label_offset_offsetcolumn_label_length_offsetr   )column_format_text_subheader_index_lengthminr   rg   column_format_offset_lengthcolumn_format_length_length(column_label_text_subheader_index_lengthcolumn_label_offset_lengthcolumn_label_length_lengthr   rj   rA   rh   rn   rl   ri   r   )rM   r   rJ   r  text_subheader_formatcol_format_offsetcol_format_lentext_subheader_labelcol_label_offsetcol_label_lenr   
format_idxformat_start
format_len	label_idxlabel_start	label_lenlabel_namescolumn_labelformat_namescolumn_formatcurrent_column_numbercolr&   r&   r'   rz   a  sX   


	z(SAS7BDATReader._process_format_subheaderr   c                 C  s   |d u r| j d ur| j }n|d u r| j}t| jdkr#|   td|dkr0| j| jkr0t S t|| j| j }| j	d}| j	d}t
j||ftd| _t
j|d| ft
jd| _d| _t| }|| |  }| jd urw|| j}|S )Nr   zNo columns to parse from filer,  r-  r1   r   )r\   r   r   rn   r   r   ro   r   r?  countr:   r   object_string_chunkzerosuint8_byte_chunk_current_row_in_chunk_indexr   r   _chunk_to_dataframer/   	set_index)rM   r   ndnsprsltr&   r&   r'   r     s*   

zSAS7BDATReader.readc                 C  s   g | _ | j| j| _t| jdkrdS t| j| jkr3|   dt| jdd| jdd}t||   | j	t
jv rA|   | j	t
jt
jt
jg vrQ|  S dS )Nr   Tz-failed to read complete page from file (read r!   z of z bytes)F)rk   rs   r   r   rU   r   r   r%   r   r   r   r   r   r   r   _read_next_page)rM   msgr&   r&   r'   rd    s,   
zSAS7BDATReader._read_next_pagec                 C  s  | j }| j}t|| |}i }d\}}td}t| jD ]}| j| }	| j| dkrq| j|d d f j| j	d d}
t
j|
tj|dd||	< | jrl| j| tjv r[t||	 d||	< n| j| tjv rlt||	 d||	< |d	7 }q| j| d
krt
j| j|d d f |dd||	< | jr| jd ur| ||	 j||	< |r||	 d||	< |d	7 }q|   tdt| j|  t|| j|dd}|S )N)r   r   zfuture.infer_stringr,  r!   r1   Fr3   r   r   r-  r.   r   zunknown column type )rj   r/   r0   )r]  ro   r   r   r
  rh   rn   r\  r8   r   r#   r9   r:   float64rY   ri   r   sas_date_formatsr@   sas_datetime_formatsrY  r`   r^   _decode_stringr   astyper   r%   reprr   )rM   nmixrc  jsjbinfer_stringjrE   col_arrdfr&   r&   r'   r^    s8   
 
"
z"SAS7BDATReader._chunk_to_dataframec                 C  s   | | jp| jS rL   )decoder^   rf   rM   br&   r&   r'   ri    s   zSAS7BDATReader._decode_stringrw  rH   c                 C  s   | j r| |S |S rL   )ra   ri  rv  r&   r&   r'   r     s   
z#SAS7BDATReader._convert_header_text)NTTNNTTrV   )rW   rX   rY   rZ   r[   rZ   r\   r]   r^   r_   r`   rZ   ra   rZ   rb   r   r+   rK   )r+   r   )r+   rK   )r+   r   )r   rB   r   rB   )r   rB   r   rB   r+   rB   )r   rB   rJ   rB   )r   rB   rJ   rB   r+   rD   )r+   rZ   )r   rB   rJ   rB   r+   rK   rL   )r   r]   r+   r   )rw  rH   r+   rD   )!rO   rP   rQ   __doc__rR   rN   r   r   r   r   r}   r   r   r   r   r   r~   r   r   r   rt   ru   rv   rw   rx   ry   r{   rz   r   rd  r^  ri  r   r&   r&   r&   r'   rS      sP   
 
?




F









+



-
!

7
#rS   )r   r   r   r   )r)   r*   r   r   r+   r*   )4rx  
__future__r   collectionsr   r   r   r   typingr   numpyr:   pandas._configr   pandas._libs.byteswapr   r	   r
   r   r   pandas._libs.sasr   r   pandas._libs.tslibs.conversionr   pandas.errorsr   pandasr#   r   r   r   pandas.io.commonr   pandas.io.sas.sas_constantsiosassas_constantsr   pandas.io.sas.sasreaderr   pandas._typingr   r   r   r5   r4   r(   r@   rA   IteratorrS   r&   r&   r&   r'   <module>   s2    

