ntfs-3g/libntfs/mft.c

/*
 * mft.c - Mft record handling code. Part of the Linux-NTFS project.
 *
 * Copyright (c) 2000-2004 Anton Altaparmakov
 *
 * This program/include file is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program/include file is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program (in the main directory of the Linux-NTFS
 * distribution in the file COPYING); if not, write to the Free Software
 * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "config.h"

#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>

#include "compat.h"

#include "types.h"
#include "device.h"
#include "debug.h"
#include "bitmap.h"
#include "attrib.h"
#include "inode.h"
#include "volume.h"
#include "layout.h"
#include "mft.h"

/**
 * ntfs_mft_records_read - read records from the mft from disk
 * @vol:	volume to read from
 * @mref:	starting mft record number to read
 * @count:	number of mft records to read
 * @b:		output data buffer
 *
 * Read @count mft records starting at @mref from volume @vol into buffer
 * @b. Return 0 on success or -1 on error, with errno set to the error
 * code.
 *
 * If any of the records exceed the initialized size of the $MFT/$DATA
 * attribute, i.e. they cannot possibly be allocated mft records, assume this
 * is a bug and return error code ESPIPE.
 *
 * The read mft records are mst deprotected and are hence ready to use. The
 * caller should check each record with is_baad_record() in case mst
 * deprotection failed.
 *
 * NOTE: @b has to be at least of size @count * vol->mft_record_size.
 */
int ntfs_mft_records_read(const ntfs_volume *vol, const MFT_REF mref,
		const s64 count, MFT_RECORD *b)
{
	s64 nr_read;
	VCN first, limit;

	Dprintf("%s(): Entering for inode 0x%llx.\n", __FUNCTION__, MREF(mref));
	if (!vol || !vol->mft_na || !b || count < 0) {
		errno = EINVAL;
		return -1;
	}
	first = MREF(mref);
	/*
	 * Number of mft records covered by the initialized part of
	 * $MFT/$DATA.  Anything at or beyond this cannot be a valid record.
	 */
	limit = vol->mft_na->initialized_size >> vol->mft_record_size_bits;
	if (first + count > limit) {
		errno = ESPIPE;
		return -1;
	}
	/* Read and mst deprotect @count records in one go. */
	nr_read = ntfs_attr_mst_pread(vol->mft_na,
			first << vol->mft_record_size_bits, count,
			vol->mft_record_size, b);
	if (nr_read == count)
		return 0;
	/* Anything other than a plain -1 failure is reported as EIO. */
	if (nr_read != -1)
		errno = EIO;
	if (nr_read < 0)
		Dperror("Error reading $Mft record(s)");
	else
		Dputs("Error: partition is smaller than it should be!");
	return -1;
}

/**
 * ntfs_mft_records_write - write mft records to disk
 * @vol:	volume to write to
 * @mref:	starting mft record number to write
 * @count:	number of mft records to write
 * @b:		data buffer containing the mft records to write
 *
 * Write @count mft records starting at @mref from data buffer @b to volume
 * @vol. Return 0 on success or -1 on error, with errno set to the error code.
 *
 * If any of the records exceed the initialized size of the $MFT/$DATA
 * attribute, i.e. they cannot possibly be allocated mft records, assume this
 * is a bug and return error code ESPIPE.
 *
 * Before the mft records are written, they are mst protected. After the write,
 * they are deprotected again, thus resulting in an increase in the update
 * sequence number inside the data buffer @b.
 *
 * If any mft records are written which are also represented in the mft mirror
 * $MFTMirr, we make a copy of the relevant parts of the data buffer @b into a
 * temporary buffer before we do the actual write. Then if at least one mft
 * record was successfully written, we write the appropriate mft records from
 * the copied buffer to the mft mirror, too.
 */
int ntfs_mft_records_write(const ntfs_volume *vol, const MFT_REF mref,
		const s64 count, MFT_RECORD *b)
{
	s64 bw;
	VCN m;
	void *bmirr = NULL;
	int cnt = 0, res = 0;

	Dprintf("%s(): Entering for inode 0x%llx.\n", __FUNCTION__, MREF(mref));
	if (!vol || !vol->mft_na || vol->mftmirr_size <= 0 || !b || count < 0) {
		errno = EINVAL;
		return -1;
	}
	m = MREF(mref);
	/* Refuse to write non-allocated mft records. */
	if (m + count > vol->mft_na->initialized_size >>
			vol->mft_record_size_bits) {
		errno = ESPIPE;
		return -1;
	}
	/*
	 * If the range overlaps the records that are mirrored in $MFTMirr,
	 * snapshot that part of @b now, i.e. before the write to $MFT below
	 * gets to touch the buffer.
	 */
	if (m < vol->mftmirr_size) {
		if (!vol->mftmirr_na) {
			errno = EINVAL;
			return -1;
		}
		cnt = vol->mftmirr_size - m;
		if (cnt > count)
			cnt = count;
		/*
		 * Only allocate when there is something to mirror.  A zero
		 * sized malloc() is allowed to return NULL, which must not be
		 * mistaken for an out of memory condition.
		 */
		if (cnt > 0) {
			bmirr = malloc(cnt * vol->mft_record_size);
			if (!bmirr) {
				errno = ENOMEM;
				return -1;
			}
			memcpy(bmirr, b, cnt * vol->mft_record_size);
		}
	}
	bw = ntfs_attr_mst_pwrite(vol->mft_na, m << vol->mft_record_size_bits,
			count, vol->mft_record_size, b);
	if (bw != count) {
		if (bw != -1)
			errno = EIO;
		/* Latch the error code before the debug output can alter it. */
		res = errno;
		if (bw >= 0)
			Dputs("Error: partial write while writing $Mft "
					"record(s)!\n");
		else
			Dperror("Error writing $Mft record(s)");
		/* Never report a short or failed write as success. */
		if (!res)
			res = EIO;
	}
	/* Mirror only the records that actually made it into $MFT. */
	if (bmirr && bw > 0) {
		if (bw < cnt)
			cnt = bw;
		bw = ntfs_attr_mst_pwrite(vol->mftmirr_na,
				m << vol->mft_record_size_bits, cnt,
				vol->mft_record_size, bmirr);
		if (bw != cnt) {
			if (bw != -1)
				errno = EIO;
			/* As above, latch errno before logging. */
			res = errno;
			Dputs("Error: failed to sync $MFTMirr! Run chkdsk.");
			if (!res)
				res = EIO;
		}
	}
	/* free(NULL) is a no-op, so no guard is needed. */
	free(bmirr);
	if (!res)
		return 0;
	errno = res;
	return -1;
}

/**
 * ntfs_file_record_read - read a FILE record from the mft from disk
 * @vol:	volume to read from
 * @mref:	mft reference specifying mft record to read
 * @mrec:	address of pointer in which to return the mft record
 * @attr:	address of pointer in which to return the first attribute
 *
 * Read a FILE record from the mft of @vol from the storage medium. @mref
 * specifies the mft record to read, including the sequence number, which can
 * be 0 if no sequence number checking is to be performed.
 *
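 * If *@mrec is NULL, the function allocates a buffer large enough to hold the
 * mft record; otherwise the caller-supplied buffer *@mrec is used, which must
 * be large enough to hold one mft record.  The record is read into the buffer
 * (mst deprotecting it in the process) and, on success, *@mrec is set to point
 * to the buffer.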
 *
 * If @attr is not NULL, *@attr is set to point to the first attribute in the
 * mft record, i.e. *@attr is a pointer into *@mrec.
 *
 * Return 0 on success, or -1 on error, with errno set to the error code.
 *
 * The read mft record is checked for having the magic FILE,
 * and for having a matching sequence number (if MSEQNO(@mref) != 0).
 * If either of these fails, -1 is returned and errno is set to EIO. If you get
 * this, but you still want to read the mft record (e.g. in order to correct
 * it), use ntfs_mft_record_read() directly.
 *
 * Note: Caller has to free *@mrec when finished.
 *
 * Note: We do not check if the mft record is flagged in use. The caller can
 *	 check if desired.
 */
int ntfs_file_record_read(const ntfs_volume *vol, const MFT_REF mref,
		MFT_RECORD **mrec, ATTR_RECORD **attr)
{
	MFT_RECORD *m;
	ATTR_RECORD *a;
	unsigned int attrs_ofs;
	int err;

	if (!vol || !mrec) {
		errno = EINVAL;
		return -1;
	}
	/*
	 * Use the buffer supplied by the caller if there is one, otherwise
	 * allocate one.  Only a buffer allocated here is freed on error.
	 */
	m = *mrec;
	if (!m) {
		m = malloc(vol->mft_record_size);
		if (!m) {
			errno = ENOMEM;
			return -1;
		}
	}
	if (ntfs_mft_record_read(vol, mref, m)) {
		err = errno;
		goto read_failed;
	}
	if (!ntfs_is_file_record(m->magic))
		goto file_corrupt;
	/* A sequence number of zero in @mref disables the check. */
	if (MSEQNO(mref) && MSEQNO(mref) != le16_to_cpu(m->sequence_number))
		goto file_corrupt;
	/*
	 * The first attribute has to start inside the record.  An offset
	 * equal to the record size would make the returned pointer point
	 * one past the end of the buffer, so reject that as well.
	 */
	attrs_ofs = le16_to_cpu(m->attrs_offset);
	if (attrs_ofs >= vol->mft_record_size)
		goto file_corrupt;
	a = (ATTR_RECORD*)((char*)m + attrs_ofs);
	*mrec = m;
	if (attr)
		*attr = a;
	return 0;
file_corrupt:
	Dputs("ntfs_file_record_read(): file is corrupt.");
	err = EIO;
read_failed:
	/* Do not free a buffer that belongs to the caller. */
	if (m != *mrec)
		free(m);
	errno = err;
	return -1;
}

/**
 * ntfs_mft_record_layout - layout an mft record into a memory buffer
 * @vol:	volume to which the mft record will belong
 * @mref:	mft reference specifying the mft record number
 * @m:		destination buffer of size >= @vol->mft_record_size bytes
 *
 * Layout an empty, unused mft record with the mft reference @mref into the
 * buffer @m.  The volume @vol is needed because the mft record structure was
 * modified in NTFS 3.1 so we need to know which volume version this mft record
 * will be used on.
 *
 * On success return 0 and on error return -1 with errno set to the error code.
 */
int ntfs_mft_record_layout(const ntfs_volume *vol, const MFT_REF mref,
		MFT_RECORD *m)
{
	ATTR_RECORD *end_attr;
	u8 *tail;
	u16 usa_ofs, usa_count, attrs_ofs;

	if (!vol || !m) {
		errno = EINVAL;
		return -1;
	}
	/*
	 * Pick the header variant by volume version.  The update sequence
	 * array follows the header, aligned to a 2-byte boundary.
	 */
	if (vol->major_ver > 3 || (vol->major_ver == 3 && vol->minor_ver)) {
		/* NTFS 3.1+ header: mft_record_number is a 32-bit field. */
		if (MREF(mref) & 0x0000ffff00000000ull) {
			Dputs("Mft reference exceeds 32 bits!");
			errno = ERANGE;
			return -1;
		}
		usa_ofs = (sizeof(MFT_RECORD) + 1) & ~1;
		m->reserved = cpu_to_le16(0);
		m->mft_record_number = cpu_to_le32(MREF(mref));
	} else
		usa_ofs = (sizeof(MFT_RECORD_OLD) + 1) & ~1;
	m->usa_ofs = cpu_to_le16(usa_ofs);
	m->magic = magic_FILE;
	/* One usa entry per NTFS sector plus one for the sequence number. */
	if (vol->mft_record_size < NTFS_SECTOR_SIZE) {
		usa_count = 1;
		Dprintf("Sector size is bigger than MFT record size.  "
				"Setting usa_count to 1.  If Windows\nchkdsk "
				"reports this as corruption, please email "
				"linux-ntfs-dev@lists.sf.net\nstating that "
				"you saw this message and that the file "
				"system created was corrupt.\nThank you.\n");
	} else
		usa_count = vol->mft_record_size / NTFS_SECTOR_SIZE + 1;
	m->usa_count = cpu_to_le16(usa_count);
	/* The update sequence number starts out as 1. */
	*(u16*)((u8*)m + usa_ofs) = cpu_to_le16(1);
	m->lsn = cpu_to_le64(0ull);
	m->sequence_number = cpu_to_le16(1);
	m->link_count = cpu_to_le16(0);
	/* Attributes start after the usa, aligned to an 8-byte boundary. */
	attrs_ofs = (usa_ofs + (usa_count << 1) + 7) & ~7;
	m->attrs_offset = cpu_to_le16(attrs_ofs);
	m->flags = cpu_to_le16(0);
	/*
	 * The only attribute is the 8 byte termination attribute, so the
	 * bytes in use are attrs_offset plus eight, 8-byte aligned.
	 */
	m->bytes_in_use = cpu_to_le32((attrs_ofs + 8 + 7) & ~7);
	m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
	m->base_mft_record = cpu_to_le64((MFT_REF)0);
	m->next_attr_instance = cpu_to_le16(0);
	/* Write the termination attribute. */
	end_attr = (ATTR_RECORD*)((u8*)m + attrs_ofs);
	end_attr->type = AT_END;
	end_attr->length = cpu_to_le32(0);
	/* Zero everything from behind the terminator to the record end. */
	tail = (u8*)end_attr + 8;
	memset(tail, 0, vol->mft_record_size - (tail - (u8*)m));
	return 0;
}

/**
 * ntfs_mft_record_format - format an mft record on an ntfs volume
 * @vol:	volume on which to format the mft record
 * @mref:	mft reference specifying mft record to format
 *
 * Format the mft record with the mft reference @mref in $MFT/$DATA, i.e. lay
 * out an empty, unused mft record in memory and write it to the volume @vol.
 *
 * On success return 0 and on error return -1 with errno set to the error code.
 */
int ntfs_mft_record_format(const ntfs_volume *vol, const MFT_REF mref)
{
	MFT_RECORD *rec;
	int saved_errno;

	if (!vol || !vol->mft_na) {
		errno = EINVAL;
		return -1;
	}
	/* Scratch buffer for exactly one mft record. */
	rec = malloc(vol->mft_record_size);
	if (!rec)
		return -1;
	/*
	 * Lay out the empty record and, only if that worked, write it out.
	 * Both steps share one failure path which keeps errno intact across
	 * the free().
	 */
	if (ntfs_mft_record_layout(vol, mref, rec) ||
			ntfs_mft_record_write(vol, mref, rec)) {
		saved_errno = errno;
		free(rec);
		errno = saved_errno;
		return -1;
	}
	free(rec);
	return 0;
}

/**
 * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
 * @vol:	volume on which to allocate the mft record
 * @start:	starting mft record at which to allocate (or -1 if none)
 *
 * Allocate an mft record in $MFT/$DATA starting to search for a free record
 * at mft record number @start or at the current allocator position if @start
 * is -1, on the mounted ntfs volume @vol.
 *
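 * On success return the now opened ntfs inode of the mft record.
 *
 * On error return NULL with errno set to the error code.
 *
 * NOTE: The allocator is not implemented yet.  At present this function
 * always fails: it returns NULL with errno set to EINVAL if the arguments are
 * invalid and to ENOTSUP otherwise.  The remainder of this comment describes
 * the intended behaviour.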
 *
 * To find a free mft record, we scan the mft bitmap for a zero bit.  To
 * optimize this we start scanning at the place specified by @start or if
 * @start is -1 we start where we last stopped and we perform wrap around when
 * we reach the end.  Note, we do not try to allocate mft records below number
 * 24 because numbers 0 to 15 are the defined system files anyway and 16 to 23
 * are special in that they are used for storing extension mft records for the
 * $DATA attribute of $MFT.  This is required to avoid the possibility of
 * creating a run list with a circular dependence which once written to disk
 * can never be read in again.  Windows will only use records 16 to 23 for
 * normal files if the volume is completely out of space.  We never use them
 * which means that when the volume is really out of space we cannot create any
 * more files while Windows can still create up to 8 small files.  We can start
 * doing this at some later time, it does not matter much for now.
 *
 * When scanning the mft bitmap, we only search up to the last allocated mft
 * record.  If there are no free records left in the range 24 to number of
 * allocated mft records, then we extend the $MFT/$DATA attribute in order to
 * create free mft records.  We extend the allocated size of $MFT/$DATA by 16
 * records at a time or one cluster, if cluster size is above 16kiB.  If there
 * is not sufficient space to do this, we try to extend by a single mft record
 * or one cluster, if cluster size is above the mft record size, but we only do
 * this if there is enough free space, which we know from the values returned
 * by the failed cluster allocation function when we tried to do the first
 * allocation.
 *
 * No matter how many mft records we allocate, we initialize only the first
 * allocated mft record, incrementing mft data size and initialized size
 * accordingly, open an ntfs_inode for it and return it to the caller, unless
 * there are less than 24 mft records, in which case we allocate and initialize
 * mft records until we reach record 24 which we consider as the first free mft
 * record for use by normal files.
 *
 * If during any stage we overflow the initialized data in the mft bitmap, we
 * extend the initialized size (and data size) by 8 bytes, allocating another
 * cluster if required.  The bitmap data size has to be at least equal to the
 * number of mft records in the mft, but it can be bigger, in which case the
 * superfluous bits are padded with zeroes.
 *
 * Thus, when we return successfully (return value non-zero), we will have:
 *	- initialized / extended the mft bitmap if necessary,
 *	- initialized / extended the mft data if necessary,
 *	- set the bit corresponding to the mft record being allocated in the
 *	  mft bitmap,
 *	- open an ntfs_inode for the allocated mft record, and we will
 *	- return the ntfs_inode.
 *
 * On error (return value zero), nothing will have changed.  If we had changed
 * anything before the error occurred, we will have reverted back to the
 * starting state before returning to the caller.  Thus, except for bugs, we
 * should always leave the volume in a consistent state when returning from
 * this function.
 *
 * Note, this function cannot make use of most of the normal functions, like
 * for example for attribute resizing, etc, because when the run list overflows
 * the base mft record and an attribute list is used, it is very important that
 * the extension mft records used to store the $DATA attribute of $MFT can be
 * reached without having to read the information contained inside them, as
 * this would make it impossible to find them in the first place after the
 * volume is dismounted.  $MFT/$BITMAP probably does not need to follow this
 * rule because the bitmap is not essential for finding the mft records, but on
 * the other hand, handling the bitmap in this special way would make life
 * easier because otherwise there might be circular invocations of functions
 * when reading the bitmap but if we are careful, we should be able to avoid
 * all problems.
 */
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, s64 start)
{
	int bad_args;

	ntfs_debug("Entering (start 0x%llx).", (long long)start);
	/*
	 * Placeholder: no allocation is performed.  Invalid arguments are
	 * reported as EINVAL, everything else as ENOTSUP, and NULL is
	 * returned in either case.
	 */
	bad_args = !vol || !vol->mft_na || !vol->mftbmp_na || start < -1;
	errno = bad_args ? EINVAL : ENOTSUP;
	return NULL;
}

/**
 * ntfs_mft_record_free - free an mft record on an ntfs volume
 * @vol:	volume on which to free the mft record
 * @ni:		open ntfs inode of the mft record to free
 *
 * Free the mft record of the open inode @ni on the mounted ntfs volume @vol.
 * Note that this function calls ntfs_inode_close() internally and hence you
 * cannot use the pointer @ni any more after this function returns success.
 *
 * On success return 0 and on error return -1 with errno set to the error code.
 */
int ntfs_mft_record_free(ntfs_volume *vol, ntfs_inode *ni)
{
	u64 rec_no;
	int saved_errno;
	u16 orig_seq, new_seq;

	if (!vol || !vol->mftbmp_na || !ni) {
		errno = EINVAL;
		return -1;
	}

	/* Remember the record number, it is needed for the bitmap update. */
	rec_no = ni->mft_no;

	/* Flag the record as no longer in use. */
	ni->mrec->flags &= ~MFT_RECORD_IN_USE;

	/*
	 * Bump the sequence number.  A value of zero is left alone and
	 * 0xffff wraps around to 1 so that zero is never produced.
	 */
	orig_seq = le16_to_cpu(ni->mrec->sequence_number);
	new_seq = orig_seq;
	if (orig_seq)
		new_seq = (orig_seq == 0xffff) ? 1 : orig_seq + 1;
	ni->mrec->sequence_number = cpu_to_le16(new_seq);

	/* Mark the inode dirty and flush the modified record. */
	ntfs_inode_mark_dirty(ni);
	if (ntfs_inode_sync(ni)) {
		/* Nothing reached the bitmap yet, only undo the record. */
		saved_errno = errno;
	} else {
		/* Release the record's bit in $MFT/$BITMAP. */
		if (ntfs_bitmap_clear_run(vol->mftbmp_na, rec_no, 1)) {
			// FIXME: If ntfs_bitmap_clear_run() guarantees
			//	  atomicity on error, the bitmap rollback
			//	  below could be skipped for this case.
			saved_errno = errno;
		} else {
			/* Dispose of the inode; on success we are done. */
			if (!ntfs_inode_close(ni))
				return 0;
			saved_errno = errno;
		}
		/* Undo the bitmap change. */
		if (ntfs_bitmap_set_run(vol->mftbmp_na, rec_no, 1))
			Dputs("Eeek! Rollback failed in ntfs_mft_record_free().  "
					"Leaving inconsistent metadata!");
	}
	/* Put the in-memory record back the way it was. */
	ni->mrec->flags |= MFT_RECORD_IN_USE;
	ni->mrec->sequence_number = cpu_to_le16(orig_seq);
	ntfs_inode_mark_dirty(ni);
	errno = saved_errno;
	return -1;
}