ecfs/fast_commit.c at master · everschen/ecfs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ecfs/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ecfs fast commits routines.
 */
#include "ecfs.h"
#include "ecfs_jbd2.h"
#include "ecfs_extents.h"
#include "mballoc.h"

#include <linux/lockdep.h>
/*
 * Ecfs Fast Commits
 * -----------------
 *
 * Ecfs fast commits implement fine grained journalling for Ecfs.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ecfs_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record delta in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - ECFS_FC_TAG_UNLINK		- records directory entry unlink
 * - ECFS_FC_TAG_LINK		- records directory entry link
 * - ECFS_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - ECFS_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - ECFS_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - ECFS_FC_TAG_INODE		- record the inode that should be replayed
 *				  during recovery. Note that iblocks field is
 *				  not replayed and instead derived during
 *				  replay.
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in memory queue of
 * inodes. During the commit operation, we commit in the following order:
 *
 * [1] Prepare all the inodes to write out their data by setting
 *     "ECFS_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
 *     deleted while it is being flushed.
 * [2] Flush data buffers to disk and clear "ECFS_STATE_FC_FLUSHING_DATA"
 *     state.
 * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
 *     all the existing handles finish and no new handles can start.
 * [4] Mark all the fast commit eligible inodes as undergoing fast commit
 *     by setting "ECFS_STATE_FC_COMMITTING" state.
 * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
 *     starting of new handles. If new handles try to start an update on
 *     any of the inodes that are being committed, ecfs_fc_track_inode()
 *     will block until those inodes have finished the fast commit.
 * [6] Commit all the directory entry updates in the fast commit space.
 * [7] Commit all the changed inodes in the fast commit space and clear
 *     "ECFS_STATE_FC_COMMITTING" for these inodes.
 * [8] Write tail tag (this tag ensures the atomicity, please read the following
 *     section for more details).
 *
 * All the inode updates must be enclosed within jbd2_journal_start()
 * and jbd2_journal_stop() similar to JBD2 journaling.
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g extended
 * attributes). Fast commit ineligibility is marked by calling
 * ecfs_fc_mark_ineligible(): This makes next fast commit operation to fall back
 * to full commit.
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses "ECFS_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
 * tag contains CRC of the contents and TID of the transaction after which
 * this fast commit should be applied. Recovery code replays fast commit
 * logs only if there's at least 1 valid tail present. For every fast commit
 * operation, there is 1 tail. This means, we may end up with multiple tails
 * in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commits tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as is then the replay is not idempotent. Let's say
 * while in replay, we crash at (z). During the second replay, file A (which was
 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 * file named A would be absent when we try to read A. So, this sequence of
 * operations is not idempotent. However, as mentioned above, instead of storing
 * the procedure fast commits store the outcome of each procedure. Thus the fast
 * commit log for above procedure would be as follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 * similarly. Thus, by converting a non-idempotent procedure into a series of
 * idempotent outcomes, fast commits ensured idempotence during the replay.
 *
 * Locking
 * -------
 * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
 * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
 * inode. Most of the code avoids acquiring both the locks, but if one must do
 * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where fast commit
 *    area is invalid (because new full commit would be found). In order to deal
 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 *    superblock state is persisted before starting the replay, so that after
 *    the crash, fast commit recovery code can look at that flag and perform
 *    fast commit recovery even if that area is invalidated by later full
 *    commits.
 *
 * 1) Handle more ineligible cases.
 *
 * 2) Change ecfs_fc_commit() to lookup logical to physical mapping using extent
 *    status tree. This would get rid of the need to call ecfs_fc_track_inode()
 *    before acquiring i_data_sem. To do that we would need to ensure that
 *    modified extents from the extent status tree are not evicted from memory.
 */

#include <trace/events/ecfs.h>
static struct kmem_cache *ecfs_fc_dentry_cachep;

/*
 * Buffer I/O completion callback installed by ecfs_fc_submit_bh(): mirror the
 * I/O result into the buffer's uptodate bit, then unlock the buffer.
 */
static void ecfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");

	if (!uptodate) {
		ecfs_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	} else {
		ecfs_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	}

	/* The buffer was locked before submission; release it now. */
	unlock_buffer(bh);
}

/* Forget the logical block range tracked for fast commit on @inode. */
static inline void ecfs_fc_reset_inode(struct inode *inode)
{
	struct ecfs_inode_info *info = ECFS_I(inode);

	info->i_fc_lblk_len = 0;
	info->i_fc_lblk_start = 0;
}

/*
 * Put the fast commit state of @inode into its initial form: both fast
 * commit list heads empty, the wait queue initialised,
 * ECFS_STATE_FC_COMMITTING cleared and no tracked block range.
 */
void ecfs_fc_init_inode(struct inode *inode)
{
	struct ecfs_inode_info *info = ECFS_I(inode);

	INIT_LIST_HEAD(&info->i_fc_list);
	INIT_LIST_HEAD(&info->i_fc_dilist);
	init_waitqueue_head(&info->i_fc_wait);

	ecfs_clear_inode_state(inode, ECFS_STATE_FC_COMMITTING);
	ecfs_fc_reset_inode(inode);
}

static bool ecfs_fc_disabled(struct super_block *sb)
{
	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(ECFS_SB(sb)->s_mount_state & ECFS_FC_REPLAY));
}

/*
 * Remove @inode from the fast commit inode list and drop the pending dentry
 * update (expected to be an ECFS_FC_TAG_CREAT record) linked on its
 * i_fc_dilist, if there is one.
 *
 * Does nothing when fast commits are disabled or when the inode is on
 * neither list. If ECFS_STATE_FC_FLUSHING_DATA is set on the inode, this
 * sleeps (dropping sbi->s_fc_lock while asleep) until that state is cleared.
 * An inode still in ECFS_STATE_FC_COMMITTING is not waited for; it only
 * triggers a warning unless the file system is already marked fast commit
 * ineligible.
 */
void ecfs_fc_del(struct inode *inode)
{
	struct ecfs_inode_info *ei = ECFS_I(inode);
	struct ecfs_sb_info *sbi = ECFS_SB(inode->i_sb);
	struct ecfs_fc_dentry_update *fc_dentry;
	wait_queue_head_t *wq;

	if (ecfs_fc_disabled(inode->i_sb))
		return;

	mutex_lock(&sbi->s_fc_lock);
	/* Nothing is queued for this inode: neither the inode nor a dentry. */
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		mutex_unlock(&sbi->s_fc_lock);
		return;
	}

	/*
	 * Since ecfs_fc_del is called from ecfs_evict_inode while having a
	 * handle open, there is no need for us to wait here even if a fast
	 * commit is going on. That is because, if this inode is being
	 * committed, ecfs_mark_inode_dirty would have waited for inode commit
	 * operation to finish before we come here. So, by the time we come
	 * here, inode's ECFS_STATE_FC_COMMITTING would have been cleared. So,
	 * we shouldn't see ECFS_STATE_FC_COMMITTING to be set on this inode
	 * here.
	 *
	 * We may come here without any handles open in the "no_delete" case of
	 * ecfs_evict_inode as well. However, if that happens, we first mark the
	 * file system as fast commit ineligible anyway. So, even in that case,
	 * it is okay to remove the inode from the fc list.
	 */
	WARN_ON(ecfs_test_inode_state(inode, ECFS_STATE_FC_COMMITTING)
		&& !ecfs_test_mount_flag(inode->i_sb, ECFS_MF_FC_INELIGIBLE));
	/*
	 * Wait for ECFS_STATE_FC_FLUSHING_DATA to clear. The wait entry and
	 * the bit wait queue are set up on i_state_flags when BITS_PER_LONG is
	 * below 64 and on i_flags otherwise.
	 */
	while (ecfs_test_inode_state(inode, ECFS_STATE_FC_FLUSHING_DATA)) {
#if (BITS_PER_LONG < 64)
		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				ECFS_STATE_FC_FLUSHING_DATA);
		wq = bit_waitqueue(&ei->i_state_flags,
				   ECFS_STATE_FC_FLUSHING_DATA);
#else
		DEFINE_WAIT_BIT(wait, &ei->i_flags,
				ECFS_STATE_FC_FLUSHING_DATA);
		wq = bit_waitqueue(&ei->i_flags,
				   ECFS_STATE_FC_FLUSHING_DATA);
#endif
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		/*
		 * Re-check the bit now that we are queued; only sleep if it is
		 * still set, and never sleep while holding s_fc_lock.
		 */
		if (ecfs_test_inode_state(inode, ECFS_STATE_FC_FLUSHING_DATA)) {
			mutex_unlock(&sbi->s_fc_lock);
			schedule();
			mutex_lock(&sbi->s_fc_lock);
		}
		finish_wait(wq, &wait.wq_entry);
	}
	list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since it is not needed to log it anyways.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		mutex_unlock(&sbi->s_fc_lock);
		return;
	}

	/* Only the first entry is removed; the list must be empty afterwards. */
	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ecfs_fc_dentry_update, fcd_dilist);
	WARN_ON(fc_dentry->fcd_op != ECFS_FC_TAG_CREAT);
	list_del_init(&fc_dentry->fcd_list);
	list_del_init(&fc_dentry->fcd_dilist);

	WARN_ON(!list_empty(&ei->i_fc_dilist));
	mutex_unlock(&sbi->s_fc_lock);

	/* The record is unlinked from both lists; free it outside s_fc_lock. */
	release_dentry_name_snapshot(&fc_dentry->fcd_name);
	kmem_cache_free(ecfs_fc_dentry_cachep, fc_dentry);
}

/*
 * Mark file system as fast commit ineligible, and record latest
 * ineligible transaction tid. This means until the recorded
 * transaction, commit operation would result in a full jbd2 commit.
 *
 * @sb:     super block of the file system to mark
 * @reason: ECFS_FC_REASON_* value, used as the index into the per-reason
 *          ineligibility statistics
 * @handle: journal handle whose transaction tid is recorded; when it is NULL
 *          or an error pointer, the tid of the journal's running transaction
 *          is used instead, if there is one
 *
 * Does nothing when fast commits are disabled for @sb. The recorded tid is
 * only moved forward while the file system is already marked ineligible.
 */
void ecfs_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
	struct ecfs_sb_info *sbi = ECFS_SB(sb);
	tid_t tid = 0;
	bool has_transaction = true;
	bool is_ineligible;

	if (ecfs_fc_disabled(sb))
		return;

	if (handle && !IS_ERR(handle)) {
		tid = handle->h_transaction->t_tid;
	} else {
		/* No usable handle: fall back to the running transaction. */
		read_lock(&sbi->s_journal->j_state_lock);
		if (sbi->s_journal->j_running_transaction)
			tid = sbi->s_journal->j_running_transaction->t_tid;
		else
			has_transaction = false;
		read_unlock(&sbi->s_journal->j_state_lock);
	}

	mutex_lock(&sbi->s_fc_lock);
	is_ineligible = ecfs_test_mount_flag(sb, ECFS_MF_FC_INELIGIBLE);
	if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
		sbi->s_fc_ineligible_tid = tid;
	ecfs_set_mount_flag(sb, ECFS_MF_FC_INELIGIBLE);
	mutex_unlock(&sbi->s_fc_lock);

	/*
	 * An out-of-range reason must not be used as an array index: warn and
	 * skip the statistics update rather than writing outside
	 * fc_ineligible_reason_count[]. The ineligible flag is already set at
	 * this point, so only the counter update is skipped.
	 */
	if (WARN_ON(reason < 0 || reason >= ECFS_FC_REASON_MAX))
		return;
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function.
 *
 * If the tid of @handle's transaction differs from ei->i_sync_tid, this is
 * the first time @inode is tracked in that transaction: the tracked block
 * range is reset, i_sync_tid is updated and __fc_track_fn() is called with
 * update = false. Otherwise __fc_track_fn() is called with update = true.
 * Based on that, the track function can determine if it needs to track a
 * field for the first time or if it just needs to update the previously
 * tracked value. __fc_track_fn() is invoked with ei->i_fc_lock held.
 *
 * If enqueue is set, the inode is then added to a fast commit inode queue
 * unless it is already on one: the staging queue while a full or fast commit
 * is ongoing, the main queue otherwise.
 *
 * Returns whatever __fc_track_fn() returned.
 */
static int ecfs_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
	void *args, int enqueue)
{
	struct ecfs_inode_info *ei = ECFS_I(inode);
	struct ecfs_sb_info *sbi = ECFS_SB(inode->i_sb);
	tid_t cur_tid = handle->h_transaction->t_tid;
	bool seen_in_tid;
	int rc;

	spin_lock(&ei->i_fc_lock);
	seen_in_tid = (cur_tid == ei->i_sync_tid);
	if (!seen_in_tid) {
		ecfs_fc_reset_inode(inode);
		ei->i_sync_tid = cur_tid;
	}
	rc = __fc_track_fn(handle, inode, args, seen_in_tid);
	spin_unlock(&ei->i_fc_lock);

	if (enqueue) {
		mutex_lock(&sbi->s_fc_lock);
		if (list_empty(&ei->i_fc_list)) {
			struct list_head *queue = &sbi->s_fc_q[FC_Q_MAIN];

			if (sbi->s_journal->j_flags &
			    (JBD2_FULL_COMMIT_ONGOING | JBD2_FAST_COMMIT_ONGOING))
				queue = &sbi->s_fc_q[FC_Q_STAGING];
			list_add_tail(&ei->i_fc_list, queue);
		}
		mutex_unlock(&sbi->s_fc_lock);
	}

	return rc;
}

/* Arguments handed to __track_dentry_update() through the track template. */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* directory entry the operation applies to */
	int op;			/* ECFS_FC_TAG_* operation being recorded */
};

/*
 * __track_fn for directory entry updates. Called with ei->i_fc_lock held;
 * the lock is dropped while the update is recorded (the body allocates
 * memory and takes sbi->s_fc_lock) and is re-acquired before returning on
 * every path.
 *
 * Returns 0 on success, -EOPNOTSUPP if the parent directory is encrypted or
 * -ENOMEM if the update record cannot be allocated. Both failures mark the
 * file system fast commit ineligible.
 */
static int __track_dentry_update(handle_t *handle, struct inode *inode,
				 void *arg, bool update)
{
	struct __track_dentry_update_args *req = arg;
	struct ecfs_inode_info *ei = ECFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct ecfs_sb_info *sbi = ECFS_SB(sb);
	struct dentry *dentry = req->dentry;
	struct inode *dir = dentry->d_parent->d_inode;
	struct ecfs_fc_dentry_update *entry;
	struct list_head *queue;
	int err = 0;

	spin_unlock(&ei->i_fc_lock);

	if (IS_ENCRYPTED(dir)) {
		ecfs_fc_mark_ineligible(sb, ECFS_FC_REASON_ENCRYPTED_FILENAME,
					handle);
		err = -EOPNOTSUPP;
		goto relock;
	}

	entry = kmem_cache_alloc(ecfs_fc_dentry_cachep, GFP_NOFS);
	if (!entry) {
		ecfs_fc_mark_ineligible(sb, ECFS_FC_REASON_NOMEM, handle);
		err = -ENOMEM;
		goto relock;
	}

	entry->fcd_op = req->op;
	entry->fcd_parent = dir->i_ino;
	entry->fcd_ino = inode->i_ino;
	take_dentry_name_snapshot(&entry->fcd_name, dentry);
	INIT_LIST_HEAD(&entry->fcd_dilist);
	INIT_LIST_HEAD(&entry->fcd_list);

	mutex_lock(&sbi->s_fc_lock);
	/* Updates arriving during an ongoing commit go to the staging queue. */
	if (sbi->s_journal->j_flags &
	    (JBD2_FULL_COMMIT_ONGOING | JBD2_FAST_COMMIT_ONGOING))
		queue = &sbi->s_fc_dentry_q[FC_Q_STAGING];
	else
		queue = &sbi->s_fc_dentry_q[FC_Q_MAIN];
	list_add_tail(&entry->fcd_list, queue);

	/*
	 * A create record is additionally linked on the inode's i_fc_dilist.
	 * That lets ecfs_fc_del() drop the record when the inode is evicted
	 * before it was ever committed, and gives a direct link between the
	 * record and its inode without scanning sbi->s_fc_q.
	 */
	if (req->op == ECFS_FC_TAG_CREAT) {
		WARN_ON(!list_empty(&ei->i_fc_dilist));
		list_add_tail(&entry->fcd_dilist, &ei->i_fc_dilist);
	}
	mutex_unlock(&sbi->s_fc_lock);

relock:
	/* The caller expects ei->i_fc_lock to be held again on return. */
	spin_lock(&ei->i_fc_lock);
	return err;
}

/*
 * Record an ECFS_FC_TAG_UNLINK update of @dentry for @inode. No check is made
 * here for fast commits being disabled or ineligible, and the inode itself
 * is not enqueued (enqueue = 0).
 */
void __ecfs_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = ECFS_FC_TAG_UNLINK,
	};
	int ret = ecfs_fc_track_template(handle, inode, __track_dentry_update,
					 &args, 0);

	trace_ecfs_fc_track_unlink(handle, inode, dentry, ret);
}

/*
 * Track the unlink of @dentry for fast commit, unless fast commits are
 * disabled or the file system is already marked fast commit ineligible.
 */
void ecfs_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	struct super_block *sb = inode->i_sb;

	if (ecfs_fc_disabled(sb) ||
	    ecfs_test_mount_flag(sb, ECFS_MF_FC_INELIGIBLE))
		return;

	__ecfs_fc_track_unlink(handle, inode, dentry);
}

/*
 * Record an ECFS_FC_TAG_LINK update of @dentry for @inode. No check is made
 * here for fast commits being disabled or ineligible, and the inode itself
 * is not enqueued (enqueue = 0).
 */
void __ecfs_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = ECFS_FC_TAG_LINK,
	};
	int ret = ecfs_fc_track_template(handle, inode, __track_dentry_update,
					 &args, 0);

	trace_ecfs_fc_track_link(handle, inode, dentry, ret);
}

/*
 * Track the link of @dentry for fast commit, unless fast commits are
 * disabled or the file system is already marked fast commit ineligible.
 */
void ecfs_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	struct super_block *sb = inode->i_sb;

	if (ecfs_fc_disabled(sb) ||
	    ecfs_test_mount_flag(sb, ECFS_MF_FC_INELIGIBLE))
		return;

	__ecfs_fc_track_link(handle, inode, dentry);
}

/*
 * Record an ECFS_FC_TAG_CREAT update of @dentry for @inode. No check is made
 * here for fast commits being disabled or ineligible, and the inode itself
 * is not enqueued (enqueue = 0).
 */
void __ecfs_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args = {
		.dentry = dentry,
		.op = ECFS_FC_TAG_CREAT,
	};
	int ret = ecfs_fc_track_template(handle, inode, __track_dentry_update,
					 &args, 0);

	trace_ecfs_fc_track_create(handle, inode, dentry, ret);
}

/*
 * Track the creation of @dentry for fast commit, unless fast commits are
 * disabled or the file system is already marked fast commit ineligible.
 */
void ecfs_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	struct super_block *sb = inode->i_sb;

	if (ecfs_fc_disabled(sb) ||
	    ecfs_test_mount_flag(sb, ECFS_MF_FC_INELIGIBLE))
		return;

	__ecfs_fc_track_create(handle, inode, dentry);
}

/*
 * __track_fn for inode tracking. On the first call for a transaction
 * (update == false) the tracked block range length is cleared and 0 is
 * returned; later calls change nothing and return -EEXIST.
 */
static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
			 bool update)
{
	if (!update) {
		ECFS_I(inode)->i_fc_lblk_len = 0;
		return 0;
	}

	return -EEXIST;
}

/*
 * Track @inode for fast commit in the transaction of @handle.
 *
 * Directories are never tracked. Nothing is done when fast commits are
 * disabled or the file system is already marked fast commit ineligible. An
 * inode for which ecfs_should_journal_data() is true is not tracked either;
 * it marks the file system ineligible with
 * ECFS_FC_REASON_INODE_JOURNAL_DATA instead.
 *
 * Otherwise this sleeps until ECFS_STATE_FC_COMMITTING is clear on the inode
 * and then tracks and enqueues it through ecfs_fc_track_template().
 *
 * Must not be called with ei->i_data_sem held (checked with lockdep).
 */
void ecfs_fc_track_inode(handle_t *handle, struct inode *inode)
{
	struct ecfs_inode_info *ei = ECFS_I(inode);
	wait_queue_head_t *wq;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ecfs_fc_disabled(inode->i_sb))
		return;

	if (ecfs_should_journal_data(inode)) {
		ecfs_fc_mark_ineligible(inode->i_sb,
					ECFS_FC_REASON_INODE_JOURNAL_DATA, handle);
		return;
	}

	if (ecfs_test_mount_flag(inode->i_sb, ECFS_MF_FC_INELIGIBLE))
		return;

	/*
	 * If we come here, we may sleep while waiting for the inode to
	 * commit. We shouldn't be holding i_data_sem when we go to sleep since
	 * the commit path needs to grab the lock while committing the inode.
	 */
	lockdep_assert_not_held(&ei->i_data_sem);

	/*
	 * Wait for ECFS_STATE_FC_COMMITTING to clear. The wait entry and the
	 * bit wait queue are set up on i_state_flags when BITS_PER_LONG is
	 * below 64 and on i_flags otherwise.
	 */
	while (ecfs_test_inode_state(inode, ECFS_STATE_FC_COMMITTING)) {
#if (BITS_PER_LONG < 64)
		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
				ECFS_STATE_FC_COMMITTING);
		wq = bit_waitqueue(&ei->i_state_flags,
				   ECFS_STATE_FC_COMMITTING);
#else
		DEFINE_WAIT_BIT(wait, &ei->i_flags,
				ECFS_STATE_FC_COMMITTING);
		wq = bit_waitqueue(&ei->i_flags,
				   ECFS_STATE_FC_COMMITTING);
#endif
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		/* Re-check after queueing ourselves; sleep only if still set. */
		if (ecfs_test_inode_state(inode, ECFS_STATE_FC_COMMITTING))
			schedule();
		finish_wait(wq, &wait.wq_entry);
	}

	/*
	 * From this point on, this inode will not be committed either
	 * by fast or full commit as long as the handle is open.
	 */
	ret = ecfs_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ecfs_fc_track_inode(handle, inode, ret);
}

/*
 * Arguments handed to __track_range(): a logical block range with both ends
 * inclusive (its length is computed as end - start + 1).
 */
struct __track_range_args {
	ecfs_lblk_t start, end;
};

/*
 * __track_fn for tracking data updates.
 *
 * Records the inclusive logical block range passed in @arg (a struct
 * __track_range_args) on the inode. When @update is set and a non-empty range
 * is already tracked, the tracked range is widened to the smallest range
 * covering both; otherwise the tracked range is replaced by the new one.
 *
 * Returns 0 on success, or -ECANCELED without touching the tracked range
 * when the inode number is below ECFS_FIRST_INO().
 */
static int __track_range(handle_t *handle, struct inode *inode, void *arg,
			 bool update)
{
	struct __track_range_args *range = arg;
	struct ecfs_inode_info *ei = ECFS_I(inode);
	ecfs_lblk_t prev_start;

	if (gid_get_lid(inode->i_ino) < ECFS_FIRST_INO(inode->i_sb)) { /* s_first_ino is 32 bit.*/
		ecfs_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	/* Nothing usable tracked yet: start over with the new range. */
	if (!(update && ei->i_fc_lblk_len > 0)) {
		ei->i_fc_lblk_start = range->start;
		ei->i_fc_lblk_len = range->end - range->start + 1;
		return 0;
	}

	/* Remember the old start: the old last block is derived from it. */
	prev_start = ei->i_fc_lblk_start;
	ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, range->start);
	ei->i_fc_lblk_len =
		max(prev_start + ei->i_fc_lblk_len - 1, range->end) -
			ei->i_fc_lblk_start + 1;

	return 0;
}

/*
 * Track a change to logical blocks @start..@end (inclusive) of @inode for
 * fast commit and enqueue the inode.
 *
 * Directories are skipped, as is everything when fast commits are disabled
 * or the file system is already marked ineligible. An inode with inline data
 * is not tracked; it marks the file system fast commit ineligible (recorded
 * under ECFS_FC_REASON_XATTR).
 */
void ecfs_fc_track_range(handle_t *handle, struct inode *inode, ecfs_lblk_t start,
			 ecfs_lblk_t end)
{
	struct super_block *sb = inode->i_sb;
	struct __track_range_args args = {
		.start = start,
		.end = end,
	};
	int ret;

	if (S_ISDIR(inode->i_mode) || ecfs_fc_disabled(sb) ||
	    ecfs_test_mount_flag(sb, ECFS_MF_FC_INELIGIBLE))
		return;

	if (ecfs_has_inline_data(inode)) {
		ecfs_fc_mark_ineligible(sb, ECFS_FC_REASON_XATTR, handle);
		return;
	}

	ret = ecfs_fc_track_template(handle, inode, __track_range, &args, 1);
	trace_ecfs_fc_track_range(handle, inode, start, end, ret);
}

/*
 * Submit the current fast commit buffer (sbi->s_fc_bh) as a REQ_SYNC write
 * and detach it from the super block info. The buffer is locked here and
 * unlocked by ecfs_end_buffer_io_sync() when the write completes.
 *
 * @is_tail: the buffer holds the tail of a fast commit; with the BARRIER
 *           mount option set, such a write also carries REQ_FUA and
 *           REQ_PREFLUSH.
 */
static void ecfs_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	struct ecfs_sb_info *sbi = ECFS_SB(sb);
	struct buffer_head *bh = sbi->s_fc_bh;
	blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC;

	/* REQ_FUA | REQ_PREFLUSH are added only for the tail block. */
	if (is_tail && test_opt(sb, BARRIER))
		opf |= REQ_FUA | REQ_PREFLUSH;

	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ecfs_end_buffer_io_sync;
	submit_bh(opf, bh);

	/* The next reservation has to fetch a fresh buffer. */
	sbi->s_fc_bh = NULL;
}

/* Ecfs commit path routines */

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During the commit time this function is used to manage fast commit
 * block space. A single reservation is never split across blocks. If the
 * current block cannot hold @len bytes while still leaving room for a PAD
 * tlv header, the rest of the current block is turned into an
 * ECFS_FC_TAG_PAD tlv, @crc is updated over that whole block, the block is
 * submitted, and the reservation is placed at the start of a new block
 * obtained from jbd2.
 *
 * Returns a pointer to the reserved bytes, or NULL if @len can never fit in
 * a block alongside a PAD tlv header or if no buffer could be obtained.
 */
static u8 *ecfs_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ecfs_sb_info *sbi = ECFS_SB(sb);
	const int blksz = sbi->s_journal->j_blocksize;
	const int start = sbi->s_fc_bytes % blksz;
	struct buffer_head *new_bh;
	struct ecfs_fc_tl pad;
	int room;
	u8 *pos;

	/* Too long to fit in any block alongside a PAD tlv: give up. */
	if (len > blksz - ECFS_FC_TAG_BASE_LEN)
		return NULL;

	if (!sbi->s_fc_bh) {
		if (jbd2_fc_get_buf(sbi->s_journal, &new_bh))
			return NULL;
		sbi->s_fc_bh = new_bh;
	}
	pos = sbi->s_fc_bh->b_data + start;

	/*
	 * Use the current block if the request fits while still leaving
	 * enough space behind it for a PAD tlv header.
	 */
	room = blksz - ECFS_FC_TAG_BASE_LEN - start;
	if (len <= room) {
		sbi->s_fc_bytes += len;
		return pos;
	}

	/*
	 * Otherwise close the current block with a zero-filled PAD tlv that
	 * runs to the end of the block, checksum and submit it, and hand out
	 * the first bytes of a new block.
	 */
	pad.fc_tag = cpu_to_le16(ECFS_FC_TAG_PAD);
	pad.fc_len = cpu_to_le16(room);
	memcpy(pos, &pad, ECFS_FC_TAG_BASE_LEN);
	memset(pos + ECFS_FC_TAG_BASE_LEN, 0, room);
	*crc = ecfs_chksum(*crc, sbi->s_fc_bh->b_data, blksz);

	ecfs_fc_submit_bh(sb, false);

	if (jbd2_fc_get_buf(sbi->s_journal, &new_bh))
		return NULL;
	sbi->s_fc_bh = new_bh;
	/* Account for the padded remainder of the old block plus @len. */
	sbi->s_fc_bytes += blksz - start + len;
	return sbi->s_fc_bh->b_data;
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, the next commit must not use whatever space is left in the
 * block after the tail. That is why the tail tlv's length covers the rest of
 * the block, the remainder is zeroed, and s_fc_bytes is rounded up to the
 * next block boundary.
 *
 * @crc: checksum accumulated so far; the value stored in the tail also
 *       covers the current block up to, but not including, the crc field.
 *
 * Returns 0 on success or -ENOSPC if space for the tail cannot be reserved.
 */
static int ecfs_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ecfs_sb_info *sbi = ECFS_SB(sb);
	const int blksz = sbi->s_journal->j_blocksize;
	struct ecfs_fc_tail tail;
	struct ecfs_fc_tl hdr;
	int end_off;
	u8 *cur;

	/*
	 * ecfs_fc_reserve_space() takes care of moving to a new block if
	 * there is not enough space in the current one for the tail.
	 */
	cur = ecfs_fc_reserve_space(sb, ECFS_FC_TAG_BASE_LEN + sizeof(tail), &crc);
	if (!cur)
		return -ENOSPC;

	/* Offset within the block of the first byte after the reservation. */
	end_off = sbi->s_fc_bytes % blksz;
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, blksz);

	hdr.fc_tag = cpu_to_le16(ECFS_FC_TAG_TAIL);
	hdr.fc_len = cpu_to_le16(blksz - end_off + sizeof(struct ecfs_fc_tail));
	memcpy(cur, &hdr, ECFS_FC_TAG_BASE_LEN);
	cur += ECFS_FC_TAG_BASE_LEN;

	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	memcpy(cur, &tail.fc_tid, sizeof(tail.fc_tid));
	cur += sizeof(tail.fc_tid);

	/* Checksum everything in this block that precedes the crc field. */
	crc = ecfs_chksum(crc, sbi->s_fc_bh->b_data,
			  cur - (u8 *)sbi->s_fc_bh->b_data);
	tail.fc_crc = cpu_to_le32(crc);
	memcpy(cur, &tail.fc_crc, sizeof(tail.fc_crc));
	cur += sizeof(tail.fc_crc);

	memset(cur, 0, blksz - end_off); /* Don't leak uninitialized memory. */

	ecfs_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Append one tlv to the fast commit area: a header carrying @tag and @len
 * followed by @len bytes copied from @val. @crc is passed on to
 * ecfs_fc_reserve_space(), which updates it when a block has to be padded
 * out.
 *
 * Returns true if the tlv was added, false if space could not be reserved.
 */
static bool ecfs_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			   u32 *crc)
{
	struct ecfs_fc_tl hdr = {
		.fc_tag = cpu_to_le16(tag),
		.fc_len = cpu_to_le16(len),
	};
	u8 *out = ecfs_fc_reserve_space(sb, ECFS_FC_TAG_BASE_LEN + len, crc);

	if (!out)
		return false;

	memcpy(out, &hdr, ECFS_FC_TAG_BASE_LEN);
	memcpy(out + ECFS_FC_TAG_BASE_LEN, val, len);

	return true;
}

/*
 * Append a dentry tlv to the fast commit area. The tag is the recorded
 * operation (fcd_op); the value is a struct ecfs_fc_dentry_info holding the
 * parent and inode numbers, followed by the snapshotted entry name (not
 * NUL-terminated). @crc is passed on to ecfs_fc_reserve_space().
 *
 * Returns true if the tlv was added, false if space could not be reserved.
 *
 * NOTE(review): the inode numbers are converted with cpu_to_le32() here,
 * while other records in this file store i_ino with cpu_to_le64() - confirm
 * the field widths of struct ecfs_fc_dentry_info.
 */
static bool ecfs_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ecfs_fc_dentry_update *fc_dentry)
{
	const int name_len = fc_dentry->fcd_name.name.len;
	struct ecfs_fc_dentry_info info;
	struct ecfs_fc_tl hdr;
	u8 *out;

	out = ecfs_fc_reserve_space(sb,
			ECFS_FC_TAG_BASE_LEN + sizeof(info) + name_len, crc);
	if (!out)
		return false;

	hdr.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	hdr.fc_len = cpu_to_le16(sizeof(info) + name_len);
	memcpy(out, &hdr, ECFS_FC_TAG_BASE_LEN);
	out += ECFS_FC_TAG_BASE_LEN;

	info.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	info.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	memcpy(out, &info, sizeof(info));
	out += sizeof(info);

	memcpy(out, fc_dentry->fcd_name.name.name, name_len);

	return true;
}

/*
 * Writes @inode to the fast commit space as an ECFS_FC_TAG_INODE tlv: the
 * inode number followed by a copy of the raw on-disk inode.
 *
 * The number of raw inode bytes copied is the full on-disk inode size when
 * the inode has inline data; otherwise it is ECFS_GOOD_OLD_INODE_SIZE, plus
 * i_extra_isize when the on-disk inode size is larger than that.
 *
 * Returns 0 on success, the error from ecfs_get_inode_loc(), or -ECANCELED
 * if space could not be reserved.
 *
 * NOTE(review): space is reserved for sizeof(fc_inode.fc_ino) but
 * sizeof(fc_inode) bytes are copied; this assumes the two are equal -
 * confirm against the definition of struct ecfs_fc_inode.
 */
static int ecfs_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ecfs_inode_info *ei = ECFS_I(inode);
	struct ecfs_fc_inode fc_inode;
	struct ecfs_fc_tl hdr;
	struct ecfs_iloc iloc;
	int raw_len;
	int ret;
	u8 *out;

	ret = ecfs_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (ecfs_test_inode_flag(inode, ECFS_INODE_INLINE_DATA))
		raw_len = ECFS_INODE_SIZE(inode->i_sb);
	else if (ECFS_INODE_SIZE(inode->i_sb) > ECFS_GOOD_OLD_INODE_SIZE)
		raw_len = ECFS_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
	else
		raw_len = ECFS_GOOD_OLD_INODE_SIZE;

	out = ecfs_fc_reserve_space(inode->i_sb,
		ECFS_FC_TAG_BASE_LEN + raw_len + sizeof(fc_inode.fc_ino), crc);
	if (out) {
		hdr.fc_tag = cpu_to_le16(ECFS_FC_TAG_INODE);
		hdr.fc_len = cpu_to_le16(raw_len + sizeof(fc_inode.fc_ino));
		memcpy(out, &hdr, ECFS_FC_TAG_BASE_LEN);
		out += ECFS_FC_TAG_BASE_LEN;

		fc_inode.fc_ino = cpu_to_le64(inode->i_ino);
		memcpy(out, &fc_inode, sizeof(fc_inode));
		out += sizeof(fc_inode);

		memcpy(out, (u8 *)ecfs_raw_inode(&iloc), raw_len);
		ret = 0;
	} else {
		ret = -ECANCELED;
	}

	/* Drop the inode buffer reference on success and failure alike. */
	brelse(iloc.bh);
	return ret;
}

/*
 * Writes the updated data ranges of @inode to the fast commit space and
 * updates @crc through the tlv helpers.
 *
 * The tracked logical block range is taken from the inode and cleared under
 * ei->i_fc_lock, then walked with ecfs_map_blocks(). Each mapped piece is
 * logged as an ECFS_FC_TAG_ADD_RANGE tlv (capped at the maximum extent
 * length for its written/unwritten state) and each unmapped piece as an
 * ECFS_FC_TAG_DEL_RANGE tlv.
 *
 * Returns 0 on success (including when nothing is tracked), -ECANCELED if
 * mapping fails, or -ENOSPC if a tlv cannot be added.
 */
static int ecfs_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	struct ecfs_inode_info *ei = ECFS_I(inode);
	ecfs_lblk_t first_lblk, last_lblk, next_lblk;
	struct ecfs_fc_add_range add_rec;
	struct ecfs_fc_del_range del_rec;
	struct ecfs_map_blocks map;
	struct ecfs_extent *ext;
	int nr;

	spin_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		spin_unlock(&ei->i_fc_lock);
		return 0;
	}
	first_lblk = ei->i_fc_lblk_start;
	last_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	/* The range is consumed by this commit. */
	ei->i_fc_lblk_len = 0;
	spin_unlock(&ei->i_fc_lock);

	next_lblk = first_lblk;
	ecfs_debug("will try writing %lld to %lld for inode %ld\n",
		   next_lblk, last_lblk, inode->i_ino);

	while (next_lblk <= last_lblk) {
		map.m_lblk = next_lblk;
		map.m_len = last_lblk - next_lblk + 1;
		nr = ecfs_map_blocks(NULL, inode, &map,
				     ECFS_GET_BLOCKS_IO_SUBMIT |
				     ECFS_EX_NOCACHE);
		if (nr < 0)
			return -ECANCELED;

		/* Nothing reported for this block: step over it. */
		if (map.m_len == 0) {
			next_lblk++;
			continue;
		}

		if (nr > 0) {
			const bool unwritten =
				!!(map.m_flags & ECFS_MAP_UNWRITTEN);
			unsigned int cap = unwritten ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(cap, map.m_len);

			add_rec.fc_ino = cpu_to_le64(inode->i_ino);
			ext = (struct ecfs_extent *)&add_rec.fc_ex;
			ext->ee_block = cpu_to_le32(map.m_lblk);
			ext->ee_len = cpu_to_le16(map.m_len);
			ecfs_ext_store_pblock(ext, map.m_pblk);
			if (unwritten)
				ecfs_ext_mark_unwritten(ext);
			else
				ecfs_ext_mark_initialized(ext);
			if (!ecfs_fc_add_tlv(inode->i_sb, ECFS_FC_TAG_ADD_RANGE,
					    sizeof(add_rec), (u8 *)&add_rec, crc))
				return -ENOSPC;
		} else {
			/* Unmapped piece: log it as a deleted range. */
			del_rec.fc_ino = cpu_to_le64(inode->i_ino);
			del_rec.fc_lblk = cpu_to_le32(map.m_lblk);
			del_rec.fc_len = cpu_to_le32(map.m_len);
			if (!ecfs_fc_add_tlv(inode->i_sb, ECFS_FC_TAG_DEL_RANGE,
					    sizeof(del_rec), (u8 *)&del_rec, crc))
				return -ENOSPC;
		}

		next_lblk += map.m_len;
	}

	return 0;
}


/*
 * Flushes data of all the inodes in the main fast commit queue: first
 * submits the data of every inode, then waits for each of them in a second
 * pass. Returns 0 on success or the first error reported by jbd2.
 */
static int ecfs_fc_flush_data(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct list_head *queue = &ECFS_SB(sb)->s_fc_q[FC_Q_MAIN];
	struct ecfs_inode_info *pos;
	int err;

	/* Pass 1: start writeback for every queued inode. */
	list_for_each_entry(pos, queue, i_fc_list) {
		err = jbd2_submit_inode_data(journal, pos->jinode);
		if (err)
			return err;
	}

	/* Pass 2: wait for the submitted data. */
	list_for_each_entry(pos, queue, i_fc_list) {
		err = jbd2_wait_inode_data(journal, pos->jinode);
		if (err)
			return err;
	}

	return 0;
}

/* Commit all the directory entry updates */
static int ecfs_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
{
	struct super_block *sb = journal->j_private;
	struct ecfs_sb_info *sbi = ECFS_SB(sb);
	struct ecfs_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ecfs_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;